获取BBS十大热门话题方法探讨(二)

最近用手机登录了一下饮水思源BBS,结果发现了一种更为简便的抓取十大热门话题标题和链接的方法。

上篇文章在探讨这个问题时,由于框架的存在,并不能直接用https模块获取整个网页内容。但是,最近用手机登录时发现了一个没有框架并且直接是十大热门话题的网页,接下来就好办了,代码如下:

// Fetches the mobile hot-topics page of the BBS and prints the topic titles
// and topic links as two parallel arrays.
// NOTE: as written, this script fails with
// "SyntaxError: unmatched pseudo-class :odd" because cheerio does not support
// jQuery's `:odd` selector extension used below.
var https = require('https');
var iconv = require('iconv-lite');
var cheerio = require('cheerio');
// Site root; prepended to each link's href to build the topic URL.
var publicUrl = "https://bbs.sjtu.edu.cn";
// Frameless page that lists the hot topics directly.
var indexUrl = "https://bbs.sjtu.edu.cn/file/bbs/mobile/top100.html";
// Parallel arrays: topicTitles[i] and topicUrls[i] describe the same topic.
var topicTitles = [];
var topicUrls = [];

https.get(indexUrl, function(res) {
// Read the body as a 'binary' string so that its bytes can be rebuilt into a
// Buffer and decoded as GBK once the response has ended.
res.setEncoding('binary');
var source = "";
res.on('data', function(data) {
source += data;
});
res.on('end', function() {
// Rebuild the raw bytes from the binary string, then decode them as GBK.
var buf = new Buffer(source, 'binary');
var str = iconv.decode(buf, 'GBK');
var $ = cheerio.load(str);
// Intended to select every odd-indexed <a>; this is the selector that
// cheerio rejects.
$("a:odd").each(function() {
var topicTitle = $(this).text();
// Drop the last 122 characters of the link text to keep only the title.
// NOTE(review): the constant is presumably tied to the page layout - confirm.
var topicTitleStop = topicTitle.length - 122;
topicTitle = topicTitle.substring(0, topicTitleStop);
var topicUrl = publicUrl + $(this).attr("href");
topicTitles.push(topicTitle);
topicUrls.push(topicUrl);
});
console.log(topicTitles);
console.log(topicUrls);
});
});

先用https.get()方法获取整个网页,再用cheerio处理得到标题和链接。由于Node.js原生不支持GBK编码(只支持utf-8等少数几种编码),而该页面采用GBK编码,中间采用了iconv-lite模块进行解码,官方这样描述道:

Pure JS character encoding conversion

运行上述代码,报错:

SyntaxError: unmatched pseudo-class :odd

原因是虽然cheerio和jQuery的用法几乎一样,但有些jQuery的选择器并不被cheerio支持,例如 :odd。在Stack Overflow中找到了解决方法,修改后的代码如下:

// Fetches the mobile hot-topics page of the BBS and prints an array of
// { topicTitle, topicUrl } objects, one per odd-indexed link on the page.
var https = require('https');
var iconv = require('iconv-lite');
var cheerio = require('cheerio');
// Site root; prepended to each link's href to build the topic URL.
var publicUrl = "https://bbs.sjtu.edu.cn";
// Frameless page that lists the hot topics directly.
var indexUrl = "https://bbs.sjtu.edu.cn/file/bbs/mobile/top100.html";
var topics = [];
// Number of trailing characters of each link's text that are cut off to
// obtain the title.
// NOTE(review): presumably tied to the current page layout - confirm.
var titleSuffixLength = 122;

https.get(indexUrl, function(res) {
  if (res.statusCode !== 200) {
    console.error('Request failed, status code: ' + res.statusCode);
    // Drain the body so the underlying socket is released.
    res.resume();
    return;
  }

  // Collect the raw Buffers: the page is decoded as GBK below, so the bytes
  // must not be turned into text before iconv-lite sees them.
  var chunks = [];
  res.on('data', function(chunk) {
    chunks.push(chunk);
  });

  res.on('end', function() {
    var str = iconv.decode(Buffer.concat(chunks), 'GBK');
    var $ = cheerio.load(str);

    // cheerio has no `:odd` pseudo-class, so pick the odd-indexed anchors
    // with .filter() instead of patching cheerio's prototype.
    $("a").filter(function(index) {
      return index % 2 === 1;
    }).each(function() {
      var link = $(this);
      var text = link.text();
      topics.push({
        topicTitle: text.substring(0, text.length - titleSuffixLength),
        topicUrl: publicUrl + link.attr("href")
      });
    });

    console.log(topics);
  });
}).on('error', function(err) {
  // Network-level failures (DNS, TLS, connection reset) would otherwise
  // surface as an unhandled 'error' event and crash the process.
  console.error('Request failed: ' + err.message);
});

这样错误消除了,结果输出也更直观,如下:

[ { topicTitle: 'F又变身OMO',
    topicUrl: 'https://bbs.sjtu.edu.cn/bbswaptcon?board=SJTUNews&reid=1468554320' },
  { topicTitle: '证明高数上的一个定理',
    topicUrl: 'https://bbs.sjtu.edu.cn/bbswaptcon?board=joke&reid=1468519051' },
  { topicTitle: '成熟靠谱男青年征婚~盐城的北大毕业定居国外',
    topicUrl: 'https://bbs.sjtu.edu.cn/bbswaptcon?board=LoveBridge&reid=1468514791' },
  { topicTitle: '欧美多恐怖袭击死伤数百显社会主义中国优越性',
    topicUrl: 'https://bbs.sjtu.edu.cn/bbswaptcon?board=SJTUNews&reid=1468551449' },
  { topicTitle: '李政道图书馆的参观问题',
    topicUrl: 'https://bbs.sjtu.edu.cn/bbswaptcon?board=Library&reid=1468546291' },
  { topicTitle: '第68届美国电视黄金时段艾美奖完整提名',
    topicUrl: 'https://bbs.sjtu.edu.cn/bbswaptcon?board=WesternTV&reid=1468553826' },
  { topicTitle: '左前轮和左后轮补胎,可以把左前换到右后吗?',
    topicUrl: 'https://bbs.sjtu.edu.cn/bbswaptcon?board=automobile&reid=1468548271' },
  { topicTitle: '[挂牌][]92年妹子诚征另一半',
    topicUrl: 'https://bbs.sjtu.edu.cn/bbswaptcon?board=LoveBridge&reid=1468560681' },
  { topicTitle: '【代挂】87年张江IT男诚意寻有缘MM',
    topicUrl: 'https://bbs.sjtu.edu.cn/bbswaptcon?board=LoveBridge&reid=1468571540' },
  { topicTitle: '推荐一个提供免费VPN账号的网站 http://xfree.p',
    topicUrl: 'https://bbs.sjtu.edu.cn/bbswaptcon?board=Graduate&reid=1468516809' } ]

仅供学习交流,以上。