# Import libraries
import requests
from lxml import etree

# Starting URL (chapter 1)
url = 'https://www.85xscc.com/book/douluodalu1/1.html'

# Request headers (pretend to be a regular browser)
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'
}

# Loop over chapters, following the "next chapter" link each time
while True:
    # Fetch the page
    res = requests.get(url, headers=headers)
    # Set the character encoding so the Chinese text decodes correctly
    res.encoding = 'utf-8'
    # print(res.text)

    # Build the XPath parsing object
    e = etree.HTML(res.text)
    # Chapter body: all <p> text inside the post container
    info = '\n'.join(e.xpath('//div[@class="m-post"]/p/text()'))
    # Chapter title
    title = e.xpath('string(//h1/text())')
    # Link to the next chapter
    url = 'https://www.85xscc.com' + e.xpath('//tr/td[2]/a/@href')[0]
    # print(url)

    # Append this chapter to one text file
    with open('斗罗大陆.txt', 'a', encoding='utf-8') as f:
        f.write(title + '\n\n' + info + '\n\n')

    # Stop once the "next" link points back to the book index page
    # (the host must match the base URL used above, otherwise this
    # comparison never succeeds and the loop never ends)
    if url == 'https://www.85xscc.com/book/douluodalu1/':
        break
There are still a few things I don't fully understand (a rough sketch addressing them follows the list):
1. How can I tell which chapter the scraper has reached while it is running?
2. If I don't want everything dumped into one text file, how do I save each chapter separately?
3. How can the code be optimized?
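A minimal sketch that touches all three points, reusing the same URLs and XPath expressions as the script above. The function names (scrape_chapter, save_chapter), the output folder 'douluodalu', the 10-second timeout and the 0.5-second pause are my own choices for illustration, not part of the original code:

import os
import re
import time

import requests
from lxml import etree

BASE = 'https://www.85xscc.com'
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'
}

def scrape_chapter(url):
    """Fetch one chapter page and return (title, body, next_url)."""
    res = requests.get(url, headers=HEADERS, timeout=10)
    res.encoding = 'utf-8'
    e = etree.HTML(res.text)
    title = e.xpath('string(//h1/text())').strip()
    body = '\n'.join(e.xpath('//div[@class="m-post"]/p/text()'))
    next_url = BASE + e.xpath('//tr/td[2]/a/@href')[0]
    return title, body, next_url

def save_chapter(index, title, body, folder='douluodalu'):
    """Write each chapter to its own numbered .txt file instead of one big file."""
    os.makedirs(folder, exist_ok=True)
    # Strip characters that are not allowed in file names
    safe_title = re.sub(r'[\\/:*?"<>|]', '', title)
    path = os.path.join(folder, f'{index:04d}_{safe_title}.txt')
    with open(path, 'w', encoding='utf-8') as f:
        f.write(title + '\n\n' + body + '\n\n')

def main():
    url = BASE + '/book/douluodalu1/1.html'
    index = 1
    while True:
        title, body, next_url = scrape_chapter(url)
        # Progress report: shows which chapter is being scraped right now
        print(f'[{index}] {title}  <-  {url}')
        save_chapter(index, title, body)
        # Stop when the "next" link points back to the book index (last chapter)
        if next_url.rstrip('/') == BASE + '/book/douluodalu1':
            break
        url = next_url
        index += 1
        # Be polite to the server: small pause between requests
        time.sleep(0.5)

if __name__ == '__main__':
    main()

Printing the title and URL each iteration answers point 1; writing a numbered file per chapter answers point 2; wrapping the work in functions and adding a timeout and a pause covers the most basic part of point 3 (a fuller version would also handle request errors and retries, which this sketch leaves out).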