自己封装的爬虫基础类。
<pre class="java" name="code">
/**
 * Identifying information that every crawler task exposes.
 */
public interface TaskBaseInfo {

    /**
     * Returns the name of the task.
     * <br/>
     * Generally used for log output.
     *
     * @return the task name
     */
    String taskName();

    /**
     * Returns the unique code of the task.
     *
     * @return a code value that is not repeated anywhere in the crawler project
     */
    String taskCode();
}
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import redis.clients.jedis.JedisCluster;
/**
 * String cache for crawler tasks: values are stored in Redis and mirrored to a
 * file so they can still be read when the Redis entry is missing or blank.
 * <p>
 * Implementors supply the Redis client and the two public accessors; the
 * {@code default*} methods provide a ready-made implementation of both.
 */
public interface TaskStringCache {

    /** Logger shared by the default methods of this interface. */
    Logger logger = LoggerFactory.getLogger(TaskStringCache.class);

    /** Root directory under which the cache files are written. */
    String BASE_FILE_PATH = "/mfs/ShareFile/static/cms/crawler/cache/";

    /**
     * Supplies the Redis cluster client used by the default cache methods.
     *
     * @return the client on which {@code get} and {@code setex} are called
     */
    JedisCluster obtainJedisCluster();

    /**
     * Reads a cached string.
     *
     * @param taskCode code of the task, must be unique
     * @param cacheKey key of the cache entry
     * @return the cached value
     */
    String getCacheStr(String taskCode, String cacheKey);

    /**
     * Stores a cached string.
     *
     * @param taskCode     code of the task, must be unique
     * @param cacheKey     key of the cache entry
     * @param cacheSeconds expiry in seconds
     * @param cacheStr     value to cache
     */
    void setCacheStr(String taskCode, String cacheKey, int cacheSeconds, String cacheStr);

    /**
     * Builds the path of the file that mirrors a cache entry:
     * {@code BASE_FILE_PATH/<taskCode>/<cacheKey>.properties}.
     * <p>
     * The directory separator ({@link File#separator}) is used between the task
     * code and the key. An earlier revision used {@link File#pathSeparator}
     * (the {@code PATH}-list separator, {@code ':'} or {@code ';'}), which put
     * every file directly into the base directory under a name such as
     * {@code code:key.properties}; files written by that revision are not found
     * under the corrected path.
     *
     * @param taskCode code of the task, used as directory name
     * @param cacheKey key of the cache entry, used as file name
     * @return the path of the backing file
     */
    default String obtainTargetFilePath(String taskCode, String cacheKey) {
        return BASE_FILE_PATH + taskCode + File.separator + cacheKey + ".properties";
    }

    /**
     * Default way of setting a cache entry: writes the value to Redis with an
     * expiry and then schedules an asynchronous write of the same value to the
     * backing file.
     * <p>
     * Note that the Redis key is {@code cacheKey} alone; {@code taskCode} only
     * influences the file location.
     *
     * @param taskName     display name of the task, used for logging
     * @param taskCode     code of the task, must be unique
     * @param cacheKey     key of the cache entry
     * @param cacheSeconds expiry of the Redis entry in seconds
     * @param cacheStr     value to cache
     */
    default void defaultSetCacheStr(String taskName, String taskCode, String cacheKey, int cacheSeconds, String cacheStr) {
        JedisCluster jedisCluster = obtainJedisCluster();
        jedisCluster.setex(cacheKey, cacheSeconds, cacheStr);
        String targetFilePath = obtainTargetFilePath(taskCode, cacheKey);
        save2FileAtomic(taskName, targetFilePath, cacheStr);
    }

    /**
     * Reads an entry stored through
     * {@link #defaultSetCacheStr(String, String, String, int, String)}.
     * <p>
     * Redis is consulted first; if it holds no non-blank value the backing file
     * is read instead. When the file read fails, the (null or blank) Redis
     * value is returned.
     *
     * @param taskName display name of the task, used for logging
     * @param taskCode code of the task, must be unique
     * @param cacheKey key of the cache entry
     * @return the cached value, or a null/blank string when neither source has one
     */
    default String defaultGetCacheStr(String taskName, String taskCode, String cacheKey) {
        JedisCluster jedisCluster = obtainJedisCluster();
        String cacheStr = jedisCluster.get(cacheKey);
        if (StringUtils.isNotBlank(cacheStr)) {
            return cacheStr;
        }
        String targetFilePath = obtainTargetFilePath(taskCode, cacheKey);
        try {
            // Little is gained from the async call here: asking for the result
            // right away blocks until the read has finished.
            cacheStr = readFile(targetFilePath).get();
        } catch (InterruptedException e) {
            // Restore the flag so callers further up can still observe the interruption.
            Thread.currentThread().interrupt();
            logger.error("【" + taskName + "】 执行异步获取文件缓存内容时失败. taskCode=>" + "【" + taskCode + "】" + " cacheKey=>" + "【" + cacheKey + "】", e);
        } catch (ExecutionException e) {
            logger.error("【" + taskName + "】 执行异步获取文件缓存内容时失败. taskCode=>" + "【" + taskCode + "】" + " cacheKey=>" + "【" + cacheKey + "】", e);
        }
        return cacheStr;
    }

    /**
     * Persists a value (e.g. the crawl cursor id) to a file, so no extra
     * database column is needed for it.
     * <p>
     * File writes are slow, so the work is submitted asynchronously and this
     * method returns immediately. The content is first written to
     * {@code filePath + ".tmp"} and then moved over the target, so readers do
     * not observe a half-written file. Failures are logged; the write is not
     * retried (the previous implementation re-submitted itself without limit
     * whenever the rename failed, e.g. when the target already existed on
     * Windows).
     *
     * @param taskName display name of the task, used for logging
     * @param filePath path of the target file
     * @param content  text to store, written as UTF-8
     */
    default void save2FileAtomic(String taskName, String filePath, String content) {
        CompletableFuture.runAsync(() -> {
            File tmpFile = new File(filePath + ".tmp");
            try {
                File parentDir = tmpFile.getParentFile();
                if (parentDir != null) {
                    Files.createDirectories(parentDir.toPath());
                }
                Files.write(tmpFile.toPath(), content.getBytes(StandardCharsets.UTF_8));
            } catch (IOException | RuntimeException e) {
                // Runtime failures are caught as well: nobody inspects the future,
                // so anything not logged here would vanish silently.
                logger.error("【" + taskName + "】 => 写入缓存字符串到文件 => 【" + tmpFile + "】 时异常 \n" + e.getMessage(), e);
                logger.error("【" + taskName + "】 文件写入操作退出");
                tmpFile.delete();
                return;
            }
            try {
                File destFile = new File(filePath);
                try {
                    Files.move(tmpFile.toPath(), destFile.toPath(), StandardCopyOption.ATOMIC_MOVE);
                } catch (IOException atomicMoveFailure) {
                    // Atomic moves are not available everywhere and whether they
                    // replace an existing target is platform specific; fall back
                    // to a plain replacing move.
                    Files.move(tmpFile.toPath(), destFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
                }
            } catch (IOException | RuntimeException e) {
                logger.error("move fails filePath:" + filePath, e);
                tmpFile.delete();
            }
        });
    }

    /**
     * Reads the whole content of a file asynchronously.
     * <p>
     * The content is decoded as UTF-8 and returned unchanged, line terminators
     * included, so a value read from the file equals the value that was
     * written. I/O errors (including a missing file) are logged and yield an
     * empty string instead of a failed future.
     *
     * @param filePath path of the file to read
     * @return a future holding the file content, or an empty string on I/O error
     */
    default CompletableFuture<String> readFile(String filePath) {
        return CompletableFuture.supplyAsync(() -> {
            StringBuilder strb = new StringBuilder();
            try (FileInputStream fis = new FileInputStream(filePath);
                    BufferedReader inReader = new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8))) {
                char[] chunk = new char[4096];
                int read;
                while ((read = inReader.read(chunk)) != -1) {
                    strb.append(chunk, 0, read);
                }
            } catch (IOException e) {
                logger.error(e.getMessage());
                return StringUtils.EMPTY;
            }
            return strb.toString();
        });
    }
}
/**
 * Minimal contract of a runnable task: a single no-argument entry point.
 */
public interface BasicTask {

    /**
     * Executes the task.
     */
    void run();
}
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.BeansException;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import com.xxx.zx.crawler.basic.BasicTask;
/**
 * Base class for crawler tasks.
 * <p>
 * It wires the cache accessors of {@link TaskStringCache} to that interface's
 * default implementations (passing {@link #taskName()} for logging) and keeps
 * the Spring {@link ApplicationContext} in a static field so beans can be
 * looked up through the static {@code getBean} helpers.
 */
public abstract class BaseCrawlerTask implements TaskBaseInfo, TaskStringCache, BasicTask, ApplicationContextAware {

    /** Per-subclass logger; hides the interface-level {@code TaskStringCache.logger}. */
    protected final Logger logger = LoggerFactory.getLogger(getClass());

    /**
     * Application context shared by all tasks.
     * <p>
     * Declared {@code volatile} because it is written in the unsynchronized
     * instance method {@link #setApplicationContext(ApplicationContext)} and
     * read in the static {@code getBean} methods; without it the write is not
     * guaranteed to become visible to threads calling the getters.
     */
    protected static volatile ApplicationContext ac;

    /**
     * Stores the injected application context in the shared static field.
     *
     * @param applicationContext the context to store
     * @throws BeansException declared by {@link ApplicationContextAware}
     */
    @Override
    public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
        ac = applicationContext;
    }

    /**
     * Looks up a bean by type in the stored application context.
     *
     * @param beanClass type of the bean
     * @param <T>       bean type
     * @return the bean returned by the context
     */
    public synchronized static <T> T getBean(Class<T> beanClass) {
        return ac.getBean(beanClass);
    }

    /**
     * Looks up a bean by name in the stored application context.
     *
     * @param beanName name of the bean
     * @return the bean returned by the context
     */
    public synchronized static Object getBean(String beanName) {
        return ac.getBean(beanName);
    }

    /**
     * Reads a cached string via
     * {@link TaskStringCache#defaultGetCacheStr(String, String, String)}.
     */
    @Override
    public String getCacheStr(String taskCode, String cacheKey) {
        return defaultGetCacheStr(taskName(), taskCode, cacheKey);
    }

    /**
     * Stores a cached string via
     * {@link TaskStringCache#defaultSetCacheStr(String, String, String, int, String)}.
     */
    @Override
    public void setCacheStr(String taskCode, String cacheKey, int cacheSeconds, String cacheStr) {
        defaultSetCacheStr(taskName(), taskCode, cacheKey, cacheSeconds, cacheStr);
    }
}