import csv

import requests
from lxml import etree
def content_t(url, timeout=10):
    """Download *url* and parse the response body into an lxml HTML tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.
            ``requests`` applies no timeout unless one is passed, so a
            stalled connection would otherwise block the crawl forever.

    Returns:
        Whatever ``etree.HTML`` produces for the response text.

    Raises:
        requests.exceptions.RequestException: The request failed or
            exceeded *timeout*.
    """
    # A browser-like User-Agent is sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'}
    resp = requests.get(url, headers=headers, timeout=timeout)
    return etree.HTML(resp.text)
def login(i):
    """Fetch page *i* of the forum listing and return its parsed tree.

    Builds the ``forumdisplay`` URL for ``fid=2`` with the given page
    number, downloads it through ``content_t``, prints a progress line
    containing the page number, and returns what ``content_t`` returned.
    """
    page_url = "http://10.3.0.118/forum.php?mod=forumdisplay&fid=2&page={}".format(i)
    tree = content_t(page_url)
    print("所爬取的页数%s" % (i,))
    return tree
# Scrape the thread entry found under one tbody of the forum listing table
def spirdein_page(e, j):
    """Return the link and title of the thread in the *j*-th ``tbody``.

    Args:
        e: Parsed listing page; must provide an ``xpath`` method.
        j: 1-based ``tbody`` index inside the ``forum_2`` table.

    Returns:
        A ``(href, title)`` tuple. ``href`` is the first ``@href`` match
        prefixed with the forum's base URL; ``title`` is the list returned
        by the ``text()`` query (possibly empty).

    Raises:
        IndexError: The ``@href`` query matched nothing.
    """
    # Both queries target the second <a> inside the row's <th>.
    anchor = '//table[@summary="forum_2"]/tbody[{}]/tr/th/a[2]'.format(j)
    title = e.xpath(anchor + '/text()')
    links = e.xpath(anchor + '/@href')
    href = 'http://10.3.0.118/' + links[0]
    print('第{}个帖子的ID:{}'.format(j, href))
    return href, title
def jiexi(href, tida):
    """Scrape the first post of the thread at *href* and append it to the CSV.

    Args:
        href: URL of the thread page to download.
        tida: Thread title, either a string or the list of text nodes an
            XPath ``text()`` query returns.

    The first post's id, author, ``fen`` (saved as the score column),
    ``leval`` (saved as the grade column) and body text are extracted,
    printed, and passed to ``save``. If one of the required nodes is
    missing, a notice is printed and nothing is saved. Errors raised by
    ``print`` or ``save`` propagate to the caller.
    """
    e = content_t(href)

    # An XPath text() query yields a list; join it so the CSV receives the
    # title itself rather than the list's repr (e.g. "['title']").
    if isinstance(tida, (list, tuple)):
        title = ''.join(str(part) for part in tida)
    else:
        title = tida

    # Only the "[0]" lookups are guarded: an empty XPath result means the
    # page lacks the expected node. Catching IndexError alone (not a bare
    # except) lets Ctrl-C and genuine I/O errors surface.
    try:
        tid = e.xpath('//div[@id="postlist"]/div[1]/@id')[0]
        author = e.xpath('//div[@class="pl bm"]/div[1]/table[1]/tr/td/div/div/div/a/text()')[0]
        fen = e.xpath('//div[@class="pl bm"]/div[1]/table[1]/tr/td/div/div[2]/div[2]/dl/dd/a/text()')[0]
        leval = e.xpath('//div[@class="pl bm"]/div[1]/table[1]/tr/td/div/p/em/a/text()')[0]
    except IndexError:
        print("已经爬完总页数:" )
        return

    # Drop text nodes that are only whitespace/newlines, then concatenate.
    pieces = e.xpath('//div[@class="pl bm"]/div[1]/table[1]/tr/td/div[2]/div/div/table/tr/td/text()')
    content = ''.join(text.strip() for text in pieces if text.strip())

    print(title, tid, author, fen, leval, content)
    save(tid, author, fen, leval, title, content)
def save(tid, author, score, grade, title, content):
    """Append one post as a row of ``data.csv`` in the working directory.

    The columns are written in the order tid, author, score, grade, title,
    content. Every value is converted with ``str`` and the row is emitted
    through ``csv.writer`` so that commas, quotes and newlines inside a
    field are quoted instead of corrupting the file.

    Raises:
        OSError: The file could not be opened or written.
    """
    row = [str(field) for field in (tid, author, score, grade, title, content)]
    # newline='' is what the csv module expects; '\n' keeps the same row
    # terminator as the previously hand-written lines.
    with open('data.csv', 'a', newline='', encoding='utf-8') as file:
        csv.writer(file, lineterminator='\n').writerow(row)
def spider_page(first_page=1, last_page=293):
    """Crawl the forum listing pages and scrape every thread found on them.

    Args:
        first_page: Number of the first listing page to fetch.
        last_page: Number of the last listing page to fetch (inclusive).

    For each page the listing is downloaded with ``login``; then
    ``tbody[2]`` through ``tbody[21]`` are probed with ``spirdein_page``
    and each thread found is handed to ``jiexi``.
    """
    for i in range(first_page, last_page + 1):
        e = login(i)
        for j in range(2, 22):
            try:
                href, title = spirdein_page(e, j)
            except IndexError:
                # spirdein_page raises IndexError when this tbody has no
                # thread link (e.g. a page with fewer rows); skip the row
                # instead of aborting the whole crawl.
                continue
            jiexi(href, title)
# Start the crawl only when the file is executed as a script.
if __name__ == '__main__':
    spider_page()