更新wz
This commit is contained in:
parent
dd7b4df73a
commit
04d9de88e8
@ -2,24 +2,29 @@ import requests
|
|||||||
from threading import Thread
|
from threading import Thread
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
import re
|
import re
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
def get_html(url):
|
def get_html(url):
|
||||||
|
if not os.path.exists("wz"):
|
||||||
|
os.mkdir("wz")
|
||||||
resp = requests.get(url)
|
resp = requests.get(url)
|
||||||
resp.close()
|
resp.close()
|
||||||
html = etree.HTML(resp.text)
|
html = etree.HTML(resp.text)
|
||||||
title = html.xpath('/html/body/div[2]/div/main/div[1]/div/h1/text()')
|
title = html.xpath('/html/body/div[2]/div/main/div[1]/div/h1/text()')
|
||||||
content = html.xpath('/html/body/div[2]/div/main/article/div')
|
content = html.xpath('/html/body/div[2]/div/main/article/div')
|
||||||
title = ''.join(list(title[0]))
|
title = ''.join(list(title[0]))
|
||||||
file = open(f"wz\\{title}.txt", "a+")
|
file = open(f"wz\\{title}.txt", "a+", errors='ignore')
|
||||||
file.write(f"URL : {url}\n")
|
URL = (f"URL : {url}\n")
|
||||||
|
file.write(URL)
|
||||||
|
print(f"正在抓取{title}")
|
||||||
## 获取内容
|
## 获取内容
|
||||||
for i in content:
|
for i in content:
|
||||||
text = i.xpath('.//text()')
|
text = i.xpath('.//text()')
|
||||||
for word in text:
|
for word in text:
|
||||||
word_encode = word.encode('utf-8')
|
|
||||||
file.write(word)
|
file.write(word)
|
||||||
file.close()
|
file.close()
|
||||||
|
print(f"{title}抓取成功")
|
||||||
|
|
||||||
|
|
||||||
for index in range(1, 4):
|
for index in range(1, 4):
|
||||||
|
Loading…
Reference in New Issue
Block a user