获取贴吧热贴方法探讨

最近想对交大贴吧中的贴子进行获取和筛选,具体想法是打算从最新的100贴中选出热门top10。最后,用Node.js实现了这一想法。

从贴吧第一页开始

贴吧一页默认50贴,先尝试获取第一页的。贴吧贴子的内容都在网页源代码中,很容易用http模块的get()方法实现,再用cheerio处理,代码如下:

var http = require('http');
var cheerio = require('cheerio');

// Site root; thread links scraped from the page are appended to this.
var publicUrl = "http://tieba.baidu.com";
// Listing page to scrape (pn=0).
var indexUrl = "http://tieba.baidu.com/f?kw=%E4%B8%8A%E6%B5%B7%E4%BA%A4%E9%80%9A%E5%A4%A7%E5%AD%A6&ie=utf-8&pn=0";
// Parallel arrays: entry k of each one describes the k-th thread found on the page.
var replyNums = [];
// Copy of replyNums that is sorted in place, leaving replyNums in page order.
var replyNumsTemp = [];
var topicTitles = [];
var topicUrls = [];
// Final result: up to ten { topicTitle, topicUrl } objects, most replies first.
var top10Topics = [];

// Download the listing page, extract every thread, then print the ten threads
// with the highest reply counts.
http.get(indexUrl, function(res) {
    var source = "";
    // Decode on the stream itself. Appending raw Buffer chunks to a string
    // decodes each chunk separately, which corrupts any multi-byte UTF-8
    // character that happens to straddle a chunk boundary.
    res.setEncoding('utf8');
    res.on('data', function(data) {
        source += data;
    });
    res.on('end', function() {
        var $ = cheerio.load(source);
        $("li.j_thread_list").each(function() {
            var replyNum = $(this).children("div").children("div.j_threadlist_li_left").children("span.threadlist_rep_num").text();
            var $topicA = $(this).children("div").children("div.j_threadlist_li_right").children("div.threadlist_lz").children("div.threadlist_title").children("a.j_th_tit");
            var topicTitle = "[回贴" + replyNum + "]" + $topicA.text();
            var topicUrl = publicUrl + $topicA.attr("href");
            replyNums.push(replyNum);
            replyNumsTemp.push(replyNum);
            topicTitles.push(topicTitle);
            topicUrls.push(topicUrl);
        });

        replyNumsTemp.sort(sortNumber);

        // Never read past the end when fewer than ten threads were scraped.
        var limit = Math.min(10, replyNumsTemp.length);
        // Reply count -> position in replyNums at which the next lookup for
        // that count must start, so threads with equal reply counts resolve
        // to distinct entries instead of the first match every time.
        var nextSearchFrom = Object.create(null);
        for (var i = 0; i < limit; i++) {
            var wantedReplyNum = replyNumsTemp[i];
            var indexNum = replyNums.indexOf(wantedReplyNum, nextSearchFrom[wantedReplyNum] || 0);
            nextSearchFrom[wantedReplyNum] = indexNum + 1;
            var top10Topic = {
                topicTitle: topicTitles[indexNum],
                topicUrl: topicUrls[indexNum]
            };
            top10Topics.push(top10Topic);
        }
        console.log(top10Topics);
    });
}).on('error', function(err) {
    // Without this listener a failed request is an unhandled 'error' event.
    console.error("Failed to fetch " + indexUrl + ": " + err.message);
});

/**
 * Comparator for Array.prototype.sort that orders values numerically from
 * largest to smallest.
 *
 * @param {*} a - First value; coerced to a number by the subtraction.
 * @param {*} b - Second value; coerced to a number by the subtraction.
 * @returns {number} Positive when b is larger than a, negative when a is
 *     larger than b, zero when they are numerically equal.
 */
function sortNumber(a, b) {
    var difference = b - a;
    return difference;
}

可以看出,通过cheerio得到了所有贴子的回贴数、标题和链接。随后对数组replyNumsTemp按照数值从大到小进行了排序,再通过检索位置选出热贴top10。

结果如下:

[ { topicTitle: '[回贴240557]【集中发言】上海交通大学吧水图专用帖⑤',
topicUrl: 'http://tieba.baidu.com/p/2270874764' },
{ topicTitle: '[回贴3031]【考研&保研】研究生招生类,考研求购类主题专贴',
topicUrl: 'http://tieba.baidu.com/p/4007342584' },
{ topicTitle: '[回贴548]【新生向】终极问题大集合2016版,16级新同学有任何问题欢
迎戳进',
topicUrl: 'http://tieba.baidu.com/p/4630809912' },
{ topicTitle: '[回贴490]闺蜜的男票我也是呵呵了。。',
topicUrl: 'http://tieba.baidu.com/p/4668702428' },
{ topicTitle: '[回贴439]所以,各位男生是喜欢什么类型的女生呢[���真脸]',
topicUrl: 'http://tieba.baidu.com/p/4661542394' },
{ topicTitle: '[回贴405]你们接着膜,再膜一个试试。',
topicUrl: 'http://tieba.baidu.com/p/4563718277' },
{ topicTitle: '[回贴360]大四学渣来送书了',
topicUrl: 'http://tieba.baidu.com/p/4631738333' },
{ topicTitle: '[回贴243]我要想同学们做出承诺!',
topicUrl: 'http://tieba.baidu.com/p/4194058373' },
{ topicTitle: '[回贴195]刚报完志愿就被告知交大建筑系。。。',
topicUrl: 'http://tieba.baidu.com/p/4638151370' },
{ topicTitle: '[回贴183]一只自己作死的笨鸟',
topicUrl: 'http://tieba.baidu.com/p/4632687296' } ]

再获取贴吧前两页

两页有100贴,差不多是交大贴吧1天的产生量。两页的话对应两个网址,可以并发异步获取。常规的写法是自己维护一个计数器。先定义一个var count=0,然后每次获取成功以后,就count++。由于无法确定这两个异步操作到底谁先完成,那么每次获取成功后,就判断一下count==2。当值为真时,使用另一个函数继续完成操作。

代码如下:

var http = require('http');
var cheerio = require('cheerio');

// Site root; thread links scraped from the pages are appended to this.
var publicUrl = "http://tieba.baidu.com";
// The two listing pages to scrape (pn=0 and pn=50).
var indexUrl1 = "http://tieba.baidu.com/f?kw=%E4%B8%8A%E6%B5%B7%E4%BA%A4%E9%80%9A%E5%A4%A7%E5%AD%A6&ie=utf-8&pn=0";
var indexUrl2 = "http://tieba.baidu.com/f?kw=%E4%B8%8A%E6%B5%B7%E4%BA%A4%E9%80%9A%E5%A4%A7%E5%AD%A6&ie=utf-8&pn=50";
// Parallel arrays: entry k of each one describes the k-th thread collected.
var replyNums = [];
// Copy of replyNums that handle() sorts in place.
var replyNumsTemp = [];
var topicTitles = [];
var topicUrls = [];
var top10Topics = [];
// Number of pages that have been downloaded and parsed so far.
var count = 0;

/**
 * Parses one listing page and appends the reply count, title and URL of
 * every thread on it to the shared arrays.
 *
 * @param {string} source - HTML of the listing page.
 */
function collectTopics(source) {
    var $ = cheerio.load(source);
    $("li.j_thread_list").each(function() {
        var replyNum = $(this).children("div").children("div.j_threadlist_li_left").children("span.threadlist_rep_num").text();
        var $topicA = $(this).children("div").children("div.j_threadlist_li_right").children("div.threadlist_lz").children("div.threadlist_title").children("a.j_th_tit");
        var topicTitle = "[回贴" + replyNum + "]" + $topicA.text();
        var topicUrl = publicUrl + $topicA.attr("href");
        replyNums.push(replyNum);
        replyNumsTemp.push(replyNum);
        topicTitles.push(topicTitle);
        topicUrls.push(topicUrl);
    });
}

/**
 * Downloads one listing page, collects its threads, bumps the completion
 * counter and calls handle(), which acts only once both pages are in.
 *
 * @param {string} url - Address of the listing page.
 */
function fetchPage(url) {
    http.get(url, function(res) {
        var source = "";
        // Decode on the stream itself. Appending raw Buffer chunks to a
        // string decodes each chunk separately, which corrupts any
        // multi-byte UTF-8 character that straddles a chunk boundary.
        res.setEncoding('utf8');
        res.on('data', function(data) {
            source += data;
        });
        res.on('end', function() {
            collectTopics(source);
            count++;
            handle();
        });
    }).on('error', function(err) {
        // Without this listener a failed request is an unhandled 'error'
        // event. The counter is not advanced, so handle() never reports a
        // ranking built from only part of the data.
        console.error("Failed to fetch " + url + ": " + err.message);
    });
}

// The two requests do not depend on each other, so both are started at once.
fetchPage(indexUrl1);
fetchPage(indexUrl2);

/**
 * Builds and prints the top-10 list once both pages have been processed.
 *
 * Reads the shared arrays replyNums, replyNumsTemp, topicTitles and
 * topicUrls, sorts replyNumsTemp in descending order, and appends up to ten
 * { title, url } objects to top10Topics. Does nothing until count is 2.
 */
function handle() {
    if (count !== 2) {
        return;
    }

    replyNumsTemp.sort(sortNumber);

    // Never read past the end when fewer than ten threads were collected.
    var limit = Math.min(10, replyNumsTemp.length);
    // Reply count -> position in replyNums at which the next lookup for that
    // count must start, so threads with equal reply counts resolve to
    // distinct entries instead of the first match every time.
    var nextSearchFrom = Object.create(null);
    for (var i = 0; i < limit; i++) {
        var wantedReplyNum = replyNumsTemp[i];
        var indexNum = replyNums.indexOf(wantedReplyNum, nextSearchFrom[wantedReplyNum] || 0);
        nextSearchFrom[wantedReplyNum] = indexNum + 1;
        var top10Topic = {
            title: topicTitles[indexNum],
            url: topicUrls[indexNum]
        };
        top10Topics.push(top10Topic);
    }
    console.log(top10Topics);
}

/**
 * Comparator for Array.prototype.sort that orders values numerically from
 * largest to smallest.
 *
 * @param {*} a - First value; coerced to a number by the subtraction.
 * @param {*} b - Second value; coerced to a number by the subtraction.
 * @returns {number} Positive when b is larger than a, negative when a is
 *     larger than b, zero when they are numerically equal.
 */
function sortNumber(a, b) {
    var difference = b - a;
    return difference;
}

结果符合实际,说明没问题。

还有一种方法很容易想到,就是将两个获取方法嵌套,先获取第一页,成功后处理,再获取第二页,成功后再处理,并完成最后操作。但这两页的获取,明显互不依赖,完全可以并行,所以这种方法效率较低,并不建议。

那有没有比自己写个计数器更优雅的方法呢?当然,那就是用eventproxy,官方对其这样介绍道:

EventProxy is only a lightweight tool, which brings about a thinking change on event programming.

修改后的代码如下:

var http = require('http');
var cheerio = require('cheerio');
var EventProxy = require('eventproxy');

// Site root; thread links scraped from the pages are appended to this.
var publicUrl = "http://tieba.baidu.com";
// The two listing pages to scrape (pn=0 and pn=50).
var indexUrl1 = "http://tieba.baidu.com/f?kw=%E4%B8%8A%E6%B5%B7%E4%BA%A4%E9%80%9A%E5%A4%A7%E5%AD%A6&ie=utf-8&pn=0";
var indexUrl2 = "http://tieba.baidu.com/f?kw=%E4%B8%8A%E6%B5%B7%E4%BA%A4%E9%80%9A%E5%A4%A7%E5%AD%A6&ie=utf-8&pn=50";
// Parallel arrays: entry k of each one describes the k-th thread collected.
var replyNums = [];
// Copy of replyNums that handle() sorts in place.
var replyNumsTemp = [];
var topicTitles = [];
var topicUrls = [];
var top10Topics = [];

/**
 * Parses one listing page and appends the reply count, title and URL of
 * every thread on it to the shared arrays.
 *
 * @param {string} source - HTML of the listing page.
 */
function collectTopics(source) {
    var $ = cheerio.load(source);
    $("li.j_thread_list").each(function() {
        var replyNum = $(this).children("div").children("div.j_threadlist_li_left").children("span.threadlist_rep_num").text();
        var $topicA = $(this).children("div").children("div.j_threadlist_li_right").children("div.threadlist_lz").children("div.threadlist_title").children("a.j_th_tit");
        var topicTitle = "[回贴" + replyNum + "]" + $topicA.text();
        var topicUrl = publicUrl + $topicA.attr("href");
        replyNums.push(replyNum);
        replyNumsTemp.push(replyNum);
        topicTitles.push(topicTitle);
        topicUrls.push(topicUrl);
    });
}

var ep = new EventProxy();
// Runs once both 'get1' and 'get2' have been emitted, receiving the data
// passed with each event in the order the event names are listed here.
ep.all('get1', 'get2', function(source1, source2) {
    // Each page is a complete HTML document, so parse them one at a time
    // (page 1 first) rather than as a single concatenated string.
    collectTopics(source1);
    collectTopics(source2);
    handle();
});

/**
 * Downloads one listing page and emits its HTML on the event proxy.
 *
 * @param {string} url - Address of the listing page.
 * @param {string} eventName - Event to emit on ep with the page HTML.
 */
function fetchPage(url, eventName) {
    http.get(url, function(res) {
        var source = "";
        // Decode on the stream itself. Appending raw Buffer chunks to a
        // string decodes each chunk separately, which corrupts any
        // multi-byte UTF-8 character that straddles a chunk boundary.
        res.setEncoding('utf8');
        res.on('data', function(data) {
            source += data;
        });
        res.on('end', function() {
            ep.emit(eventName, source);
        });
    }).on('error', function(err) {
        // Without this listener a failed request is an unhandled 'error'
        // event. The event is not emitted, so the ep.all callback never
        // runs on partial data.
        console.error("Failed to fetch " + url + ": " + err.message);
    });
}

// The two requests do not depend on each other, so both are started at once.
fetchPage(indexUrl1, 'get1');
fetchPage(indexUrl2, 'get2');

/**
 * Builds and prints the top-10 list from the collected threads.
 *
 * Reads the shared arrays replyNums, replyNumsTemp, topicTitles and
 * topicUrls, sorts replyNumsTemp in descending order, and appends up to ten
 * { title, url } objects to top10Topics.
 */
function handle() {
    replyNumsTemp.sort(sortNumber);

    // Never read past the end when fewer than ten threads were collected.
    var limit = Math.min(10, replyNumsTemp.length);
    // Reply count -> position in replyNums at which the next lookup for that
    // count must start, so threads with equal reply counts resolve to
    // distinct entries instead of the first match every time.
    var nextSearchFrom = Object.create(null);
    for (var i = 0; i < limit; i++) {
        var wantedReplyNum = replyNumsTemp[i];
        var indexNum = replyNums.indexOf(wantedReplyNum, nextSearchFrom[wantedReplyNum] || 0);
        nextSearchFrom[wantedReplyNum] = indexNum + 1;
        var top10Topic = {
            title: topicTitles[indexNum],
            url: topicUrls[indexNum]
        };
        top10Topics.push(top10Topic);
    }
    console.log(top10Topics);
}

/**
 * Comparator for Array.prototype.sort that orders values numerically from
 * largest to smallest.
 *
 * @param {*} a - First value; coerced to a number by the subtraction.
 * @param {*} b - Second value; coerced to a number by the subtraction.
 * @returns {number} Positive when b is larger than a, negative when a is
 *     larger than b, zero when they are numerically equal.
 */
function sortNumber(a, b) {
    var difference = b - a;
    return difference;
}

结果和自己写计数器的一样,也没问题。

仅供学习交流,以上。