基于 Node.js 的网站爬虫程序
yarn #npm install
+-- src
| HtmlDownloader //网页下载器
| HtmlParser //网页解析器
| Outputer //内容输出
| UrlManager //url管理
| main //主入口和调度
//main.js
/**
 * Entry point and scheduler of the crawler.
 *
 * Repeatedly takes a URL from the URL manager, downloads the page, passes the
 * HTML to the parser, queues the links the parser returns and collects the
 * parsed content. When the queue is empty the collected data is output.
 */
class ParserScheduler {
  constructor() {
    // Incremented each time a follow-up page is scheduled after a parsed page.
    this.parseCount = 0;
    this.urls = new UrlManager();
    this.downloader = new HtmlDownloader();
    this.outputer = new Outputer();
    this.parser = new HtmlParser();
  }

  /**
   * Processes the next queued URL and then, recursively, every URL that
   * remains in the queue.
   *
   * @returns {Promise<undefined>} Settles once the queue is drained and
   *   `outputer.output()` has been called. Rejects if any download or parse
   *   step fails, so the caller can handle the failure.
   */
  parse() {
    const { urls, downloader, parser, outputer } = this;
    const newUrl = urls.getNewUrl();
    console.log('new url:' + newUrl);

    return downloader.download(newUrl).then((html) => {
      const [newUrls, content] = parser.parse(html);
      urls.addNewUrls(newUrls);
      outputer.collectData(content);

      if (!urls.hasNewUrl()) {
        console.log('complete!');
        outputer.output();
        return undefined;
      }

      this.parseCount++;
      // Return the nested promise: without it the chain is left floating, the
      // caller cannot wait for the crawl to finish, and a failure on any later
      // page surfaces as an unhandled rejection.
      return this.parse();
    });
  }

  /**
   * Seeds the queue with the root URL and starts crawling.
   *
   * `rootUrl` is not defined in this class; it is expected to be in scope
   * where the class is defined.
   *
   * @param {number} [count] Accepted for compatibility; currently unused.
   * @returns {Promise<undefined>} The promise returned by `parse()`.
   */
  start(count) {
    this.urls.addNewUrl(rootUrl);
    return this.parse();
  }
}
//parser
/**
 * Extracts follow-up links and course images from a downloaded page.
 */
class HtmlParser {
  /**
   * Parses one HTML document.
   *
   * @param {string} html Markup of the page, loaded with cheerio.
   * @returns {[string[], {src: string, alt: string}[]]} A pair:
   *   - the `href` values of `.text-page-tag` elements that contain
   *     `/course/list`;
   *   - one `{src, alt}` entry per `.course-banner` element that has a `src`,
   *     where `alt` is the text of the `.course-card-name` inside the closest
   *     `.course-card-container`.
   */
  parse(html) {
    const aLinks = [];
    const images = [];
    const $ = cheerio.load(html);

    $('.text-page-tag').each((index, item) => {
      const href = item.attribs.href;
      // An element without an href attribute would otherwise throw on
      // `href.indexOf` and abort parsing of the whole page.
      if (typeof href === 'string' && href.indexOf('/course/list') >= 0) {
        aLinks.push(href);
      }
    });

    $('.course-banner').each((index, item) => {
      const src = item.attribs.src;
      if (!src) {
        return;
      }
      const alt = $(item)
        .closest('.course-card-container')
        .find('.course-card-name')
        .text();
      images.push({ src, alt });
    });

    return [aLinks, images];
  }
}
//downloader
/**
 * Fetches the body of a web page.
 */
class HtmlDownloader {
  /**
   * Requests `url` with a fixed `User-Agent` header.
   *
   * @param {string} url Address of the page to fetch.
   * @returns {Promise<*>} Fulfils with the `body` argument that `request`
   *   passes to its callback; rejects with the callback's `error` when one is
   *   reported.
   */
  download(url) {
    const fetchPage = (fulfil, fail) => {
      const options = {
        headers: {
          'User-Agent': 'Mozilla/5.0',
        },
      };
      const onResponse = (err, res, payload) => {
        if (err) {
          fail(err);
          return;
        }
        fulfil(payload);
      };
      request(url, options, onResponse);
    };
    return new Promise(fetchPage);
  }
}
//urlmanager
/**
 * Keeps track of which URLs still have to be crawled and which have already
 * been handed out.
 */
class UrlManager {
  constructor() {
    // Queue of URLs waiting to be crawled (first in, first out).
    this.newUrls = [];
    // URLs that have already been returned by getNewUrl().
    this.oldUrls = [];
  }

  /**
   * @returns {boolean} True while at least one URL is waiting in the queue.
   */
  hasNewUrl() {
    return this.newUrls.length !== 0;
  }

  /**
   * Removes the oldest queued URL and records it as visited.
   *
   * @returns {string|undefined} The next URL, or `undefined` when the queue is
   *   empty (in which case nothing is recorded as visited).
   */
  getNewUrl() {
    if (this.newUrls.length === 0) {
      return undefined;
    }
    const url = this.newUrls.shift();
    if (!this.oldUrls.includes(url)) {
      this.oldUrls.push(url);
    }
    return url;
  }

  /**
   * Queues a URL unless it is already queued or already visited.
   *
   * Absolute `http(s)://` URLs are stored as given, protocol-relative URLs
   * (`//host/path`) get an `http:` scheme, and everything else is appended to
   * the site origin `http://www.imooc.com`. Non-string and empty values are
   * ignored.
   *
   * @param {string} url URL or site-relative path to queue.
   */
  addNewUrl(url) {
    if (typeof url !== 'string' || url === '') {
      return;
    }

    let absolute;
    if (/^https?:\/\//i.test(url)) {
      // Check the scheme at the start only: a relative URL that merely
      // contains "http" (e.g. in its query string) must not count as absolute.
      absolute = url;
    } else if (url.startsWith('//')) {
      absolute = 'http:' + url;
    } else {
      absolute = 'http://www.imooc.com' + url;
    }

    if (!this.newUrls.includes(absolute) && !this.oldUrls.includes(absolute)) {
      this.newUrls.push(absolute);
    }
  }

  /**
   * Queues every entry of `urls` via addNewUrl(). Anything that is not an
   * array is ignored.
   *
   * @param {string[]} urls URLs or site-relative paths to queue.
   */
  addNewUrls(urls) {
    if (!Array.isArray(urls)) {
      return;
    }
    for (const url of urls) {
      this.addNewUrl(url);
    }
  }
}
//outputer
/**
 * Collects parsed image records and writes the images to disk.
 */
class Outputer {
  constructor() {
    // Collected records; output() reads `src` and `alt` from each one.
    this.data = [];
  }

  /**
   * Streams one image into `D:\parser_pics\<encoded filename>.jpg`.
   *
   * The target directory is not created here, so it must already exist.
   * Download and write failures are logged rather than thrown, because they
   * happen asynchronously and a stream 'error' event without a listener would
   * terminate the process.
   *
   * @param {string} url Image address; a URL without an `http:`/`https:`
   *   scheme (e.g. `//host/a.jpg`) is prefixed with `http:`.
   * @param {string} filename Base name of the file; URI-encoded before use.
   * @throws {TypeError} When `url` is not a non-empty string.
   */
  _getImage(url, filename) {
    console.log('写入图片文件:'+filename);
    if (typeof url !== 'string' || url === '') {
      throw new TypeError('image url must be a non-empty string');
    }

    // Test the scheme at the start and accept https too; otherwise an https
    // URL would become "http:https://...".
    const target = /^https?:/i.test(url) ? url : 'http:' + url;
    const opts = {
      headers: {
        'User-Agent': 'Mozilla/5.0',
      },
    };
    // Both backslashes must be escaped: '\p' is just 'p', which would produce
    // the drive-relative path 'D:parser_pics\'.
    const filePath = 'D:\\parser_pics\\' + encodeURIComponent(filename) + '.jpg';
    const logError = (err) => {
      console.log(err);
    };

    request(target, opts)
      .on('error', logError)
      .pipe(fs.createWriteStream(filePath).on('error', logError));
  }

  /**
   * Appends every entry of `datas` to the collected records. Anything that is
   * not an array is ignored.
   *
   * @param {Array<{src: string, alt: string}>} datas Records to collect.
   */
  collectData(datas) {
    if (!Array.isArray(datas)) {
      return;
    }
    for (const record of datas) {
      this.data.push(record);
    }
  }

  /**
   * Starts a download for every collected record. A record that cannot be
   * started is logged and skipped so that the remaining ones are still tried.
   */
  output() {
    for (const item of this.data) {
      try {
        this._getImage(item.src, item.alt);
      } catch (e) {
        console.log(e);
      }
    }
  }
}
node main.js
目前支持的 Node.js 版本为 10.0.0