wz博客文章
This commit is contained in:
parent
aef2ea236f
commit
dd7b4df73a
37
python/spider/blog/wz.py
Normal file
37
python/spider/blog/wz.py
Normal file
@ -0,0 +1,37 @@
|
||||
import requests
|
||||
from threading import Thread
|
||||
from lxml import etree
|
||||
import re
|
||||
|
||||
|
||||
def get_html(url):
    """Download one blog post and append its text to ``wz/<title>.txt``.

    The page at *url* is fetched and parsed, the post title is read from the
    page's ``<h1>`` and the body text from the article ``<div>``. The output
    file receives a ``URL : <url>`` header line followed by every text node
    of the article, in document order. The file is opened in append mode, so
    fetching the same post twice appends a second copy.

    Pages where the title XPath matches nothing are skipped silently.

    Args:
        url: Absolute URL of the post to download.

    Raises:
        requests.RequestException: If the request fails or times out.
        OSError: If the output file cannot be created or written.
    """
    # Local import: only this function needs ``os``, and it keeps the edit
    # self-contained without touching the module's import block.
    import os

    # A timeout keeps one stalled connection from hanging the whole crawl.
    with requests.get(url, timeout=10) as resp:
        html = etree.HTML(resp.text)
    if html is None:
        # NOTE(review): etree.HTML appears to return None for a body with no
        # parsable markup - confirm against the installed lxml version.
        return

    title_nodes = html.xpath('/html/body/div[2]/div/main/div[1]/div/h1/text()')
    content = html.xpath('/html/body/div[2]/div/main/article/div')
    if not title_nodes:
        # Not a post page (or the layout changed): nothing to name the file.
        return

    # Titles are free text; replace characters that are illegal in file names
    # (path separators, Windows-reserved punctuation, control characters).
    title = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title_nodes[0].strip())
    title = title or 'untitled'

    os.makedirs('wz', exist_ok=True)
    out_path = os.path.join('wz', f"{title}.txt")

    # Explicit UTF-8 so the output does not depend on the platform's locale
    # encoding; ``with`` guarantees the handle is closed even if a write fails.
    with open(out_path, "a", encoding="utf-8") as file:
        file.write(f"URL : {url}\n")
        # Collect the content
        for node in content:
            file.writelines(node.xpath('.//text()'))
|
||||
|
||||
|
||||
# Crawl listing pages 1-3 of the blog and save every post linked from each.
for index in range(1, 4):
    domain = f"https://wyc21.com/index.php/page/{index}"
    # Read the body before the response is released; the timeout keeps one
    # stalled connection from hanging the whole crawl.
    with requests.get(domain, timeout=10) as resp:
        html = etree.HTML(resp.text)
    if html is None:
        # Nothing parsable came back for this listing page; try the next one.
        continue

    dev = html.xpath("/html/body/div[2]/div/main/div[1]/div")
    urls = []
    for item in dev:
        url = item.xpath("./div[2]/div[1]/a/@href")
        # Some entries under the listing container may carry no post link;
        # skip them instead of failing on an empty XPath result.
        if url:
            urls.append(url[0])

    for url in urls:
        get_html(url)
|
Loading…
Reference in New Issue
Block a user