Nodejs爬虫实战(五)
1. 抓取标签内容
引入模块
新模块 `jsdom` 中的 `JSDOM`
创建对象
let DOM = new JSDOM(html); let document = DOM.window.document;
DOM 操作:`document.querySelector('.tm-count').innerHTML`
###### 完整代码
// Running count of GetUrl invocations; GetUrl increments it on every call and
// interpolates it into its redirect log message.
let index = 0;
const fs = require('fs');
const url = require('url');
const gbk = require('gbk');
const JSDOM = require('jsdom').JSDOM;
// Download one Tmall product page and print the inner HTML of its `.tm-count` element.
GetUrl('https://detail.tmall.com/item.htm?id=548466958386&ali_refid=a3_430583_1006:1103419234:N:%E5%8D%8E%E4%B8%BA:bb84ee4c8f67c7b202d725187b7ad429&ali_trackid=1_bb84ee4c8f67c7b202d725187b7ad429&spm=a230r.1.14.1&sku_properties=5919063:6536025;12304035:116177', (data) => {
  // `data` is the raw response Buffer handed over by GetUrl. It is turned into a
  // string with gbk.toString before parsing (presumably the page is GBK-encoded —
  // confirm against the gbk package documentation).
  const html = gbk.toString('utf-8', data);
  const { document } = new JSDOM(html).window;

  // querySelector returns null when no element matches; without this guard the
  // `.innerHTML` access would throw inside the response callback and crash the script.
  const countNode = document.querySelector('.tm-count');
  if (countNode === null) {
    console.error('未找到 .tm-count 元素');
    return;
  }
  console.log(countNode.innerHTML);
});
/**
 * Issues a GET request for `sUrl` and passes the complete response body to
 * `success` as a single Buffer once a 200 response has ended.
 *
 * The transport module (`http` or `https`) is chosen from the URL's protocol.
 * Redirect responses are followed by calling GetUrl again with the `Location`
 * target. Every call increments the module-level `index` counter, which is
 * printed in the redirect log line.
 *
 * @param {string} sUrl - Absolute URL to request.
 * @param {function(Buffer): void} [success] - Receives the raw body bytes of a 200 response.
 * @param {number} [maxRedirects=10] - Number of further redirects that may still be
 *   followed; stops a redirect loop from recursing forever.
 */
function GetUrl(sUrl, success, maxRedirects = 10) {
  index++;
  const urlObj = url.parse(sUrl);
  const transport = urlObj.protocol === 'http:' ? require('http') : require('https');
  const redirectCodes = [301, 302, 303, 307, 308];

  const req = transport.request({
    hostname: urlObj.hostname,
    // Forward an explicit port; when the URL has none this is null and the
    // transport falls back to its default port.
    port: urlObj.port,
    path: urlObj.path,
  }, (res) => {
    const status = res.statusCode;

    if (status === 200) {
      const chunks = [];
      res.on('data', (chunk) => {
        chunks.push(chunk);
      });
      res.on('end', () => {
        // Join the raw Buffers instead of concatenating strings so that multi-byte
        // characters split across chunks are not corrupted before decoding.
        success && success(Buffer.concat(chunks));
      });
      return;
    }

    // The body of any non-200 response is unused; drain it so the socket is released.
    res.resume();

    if (redirectCodes.includes(status)) {
      const location = res.headers.location;
      if (!location) {
        console.error(`重定向响应缺少 Location 头: ${sUrl}`);
        return;
      }
      if (maxRedirects <= 0) {
        console.error(`重定向次数过多,已停止: ${sUrl}`);
        return;
      }
      console.log(`第${index}次重定向`, location);
      // Location may be relative; resolve it against the URL that was just requested.
      GetUrl(url.resolve(sUrl, location), success, maxRedirects - 1);
      return;
    }

    console.error(`请求 ${sUrl} 返回状态码 ${status}`);
  });

  // Report the real failure: a request-level error (DNS, refused connection, TLS, ...)
  // is not an HTTP 404.
  req.on('error', (err) => {
    console.error(`请求失败: ${sUrl}`, err.message);
  });
  req.end();
}
原文作者: 冯亚杰(DanBoard·Feng)
原文链接: http://danbo3110.github.io/2019/10/23/Nodejs爬虫实战(五)/
版权声明: 转载请注明出处(必须保留作者署名及链接)