#----------------------------------------------------------------------
#   Program:  Baidu Tieba spider
#   Language: Python 2.7
#   Usage:    given a thread URL, fetch only the original poster's
#             content and save it to a local .txt file
#----------------------------------------------------------------------
import urllib2
import re
# ---------- helper that strips the HTML tags from post bodies ----------
class HTML_Tool:
    # non-greedily match \t, \n, spaces, hyperlinks and images, all dropped
    BgnCharToNoneRex = re.compile("(\t|\n| |<a.*?>|<img.*?>)")

    # non-greedily match any remaining tag, dropped at the end
    EndCharToNoneRex = re.compile("<.*?>")

    # opening <p> tags become an indented line break
    BgnPartRex = re.compile("<p.*?>")

    # these tags become a plain line break
    CharToNewLineRex = re.compile("(<br/>|</p>|<tr>|<div>|</div>)")

    # table cells become a tab
    CharToNextTabRex = re.compile("<td>")

    # turn a few HTML character entities back into literal characters
    replaceTab = [("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&"),
                  ("&quot;", "\""), ("&nbsp;", " ")]

    def Replace_Char(self, x):
        x = self.BgnCharToNoneRex.sub("", x)
        x = self.BgnPartRex.sub("\n    ", x)
        x = self.CharToNewLineRex.sub("\n", x)
        x = self.CharToNextTabRex.sub("\t", x)
        x = self.EndCharToNoneRex.sub("", x)

        for t in self.replaceTab:
            x = x.replace(t[0], t[1])
        return x
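
# A quick sanity check of HTML_Tool (my own illustration, not part of the
# original post). Given the regexes above,
#
#   HTML_Tool().Replace_Char('<p>Hello<br/>World</p> &amp; more')
#
# first strips the spaces, rewrites <p> as an indented line break, rewrites
# <br/> and </p> as plain newlines, then decodes the entity, returning
# '\n    Hello\nWorld\n&more'.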
class Baidu_Spider:
    # declare the relevant attributes
    def __init__(self, url):
        # '?see_lz=1' restricts the thread to posts by the original poster
        self.myUrl = url + '?see_lz=1'
        self.datas = []
        self.myTool = HTML_Tool()
        print u'The Baidu Tieba spider has started, crunch crunch'
    # load the first page and kick off the whole run
    def baidu_tieba(self):
        # read the raw page and decode it from gbk
        myPage = urllib2.urlopen(self.myUrl).read().decode("gbk")
        # work out how many pages of OP content the thread has
        endPage = self.page_counter(myPage)
        # pull out the thread title
        title = self.find_title(myPage)
        print u'Thread title: ' + title
        # fetch and store the actual content
        self.save_data(self.myUrl, title, endPage)
    # work out how many pages there are in total
    def page_counter(self, myPage):
        # match the page counter, e.g. 'class="red">12</span>'
        myMatch = re.search(r'class="red">(\d+?)</span>', myPage, re.S)
        if myMatch:
            endPage = int(myMatch.group(1))
            print u'Spider report: the OP has %d page(s) of original content' % endPage
        else:
            endPage = 0
            print u'Spider report: could not work out how many pages the OP posted!'
        return endPage
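
    # What page_counter expects to find (my illustration, assuming the Tieba
    # markup of the time): a fragment such as '<span class="red">12</span>'
    # in the page source would make it return 12.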
    # find the thread title
    def find_title(self, myPage):
        # match <h1 ...>xxxxxxxxxx</h1> to dig out the title
        myMatch = re.search(r'<h1.*?>(.*?)</h1>', myPage, re.S)
        title = u'Untitled'
        if myMatch:
            title = myMatch.group(1)
        else:
            print u'Spider report: could not load the thread title!'
        # the title becomes the file name, so strip the characters
        # that are not allowed in file names: \ / : * ? " < > |
        title = title.replace('\\', '').replace('/', '').replace(':', '') \
                     .replace('*', '').replace('?', '').replace('"', '') \
                     .replace('>', '').replace('<', '').replace('|', '')
        return title
    # store the OP's posts
    def save_data(self, url, title, endPage):
        # load every page's data into self.datas
        self.get_data(url, endPage)
        # then write it all out to a local file
        f = open(title + '.txt', 'w+')
        f.writelines(self.datas)
        f.close()
        print u'Spider report: the content has been saved locally as a txt file'
        print u'Press any key to exit...'
        raw_input()
    # fetch each page's source and hand it over for processing
    def get_data(self, url, endPage):
        # pages are addressed as ...?see_lz=1&pn=<page number>
        url = url + '&pn='
        for i in range(1, endPage + 1):
            print u'Spider report: spider #%d is loading...' % i
            myPage = urllib2.urlopen(url + str(i)).read()
            # clean up the HTML in myPage and store it in self.datas
            self.deal_data(myPage.decode('gbk'))
    # dig the post bodies out of the page source
    def deal_data(self, myPage):
        myItems = re.findall('id="post_content.*?>(.*?)</div>', myPage, re.S)
        for item in myItems:
            data = self.myTool.Replace_Char(item.replace("\n", "").encode('gbk'))
            self.datas.append(data + '\n')
#------------------------------ entry point ------------------------------
print u'Please enter the digit string at the end of the thread URL:'
bdurl = 'http://tieba.baidu.com/p/' + str(raw_input(u'http://tieba.baidu.com/p/'))
# kick it off
mySpider = Baidu_Spider(bdurl)
mySpider.baidu_tieba()
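
The listing above is Python 2.7 code (urllib2, print statements, raw_input). As a minimal sketch of the same fetch-and-decode step on Python 3 (my own addition, not part of the original post), urlopen now lives in urllib.request:

from urllib.request import urlopen

url = 'http://tieba.baidu.com/p/12345?see_lz=1'  # hypothetical thread URL
page = urlopen(url).read().decode('gbk')         # Tieba pages were gbk-encoded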