practice_code/python/spider/blog/wz.py

43 lines
1.1 KiB
Python
Raw Permalink Normal View History

2024-03-27 17:08:25 +08:00
import requests
from threading import Thread
from lxml import etree
import re
2024-03-27 18:37:21 +08:00
import os
2024-03-27 17:08:25 +08:00
def get_html(url):
    """Download one article page and append its text to ``wz/<title>.txt``.

    The page is fetched with ``requests`` and parsed with ``lxml``. The
    article title is read from the page's ``<h1>`` and used as the output
    file name; the source URL is written first, followed by every text node
    found under the article body.

    Args:
        url: Address of the article page to fetch.

    Returns:
        None. Progress is reported on stdout; a page with no usable title is
        reported and skipped.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("wz", exist_ok=True)

    # A timeout keeps one stalled connection from hanging the whole crawl.
    resp = requests.get(url, timeout=30)
    resp.close()
    html = etree.HTML(resp.text)
    title_nodes = html.xpath('/html/body/div[2]/div/main/div[1]/div/h1/text()')
    content = html.xpath('/html/body/div[2]/div/main/article/div')

    title = str(title_nodes[0]).strip() if title_nodes else ""
    if not title:
        # Without a title there is no file name to write to.
        print(f"{url} 未找到标题，已跳过")
        return

    # Replace characters that are path separators or are rejected in file
    # names on common platforms, so the title cannot escape "wz" or make
    # open() fail.
    filename = re.sub(r'[\\/:*?"<>|\r\n\t]', "_", title)
    # os.path.join picks the right separator for the current platform.
    path = os.path.join("wz", f"{filename}.txt")

    # Explicit UTF-8 so text is not dropped by a narrower platform default
    # codec; the context manager closes the file even if a write fails.
    with open(path, "a", encoding="utf-8", errors="ignore") as file:
        file.write(f"URL : {url}\n")
        print(f"正在抓取{title}")
        # Write every text node of the article body, in document order.
        for node in content:
            file.writelines(node.xpath('.//text()'))
    print(f"{title}抓取成功")
2024-03-27 17:08:25 +08:00
# Walk listing pages 1-3, collect the article link from each entry, and
# download every linked article with get_html().
for index in range(1, 4):
    domain = f"https://wyc21.com/index.php/page/{index}"
    # A timeout keeps one stalled connection from hanging the whole crawl.
    resp = requests.get(domain, timeout=30)
    resp.close()
    html = etree.HTML(resp.text)
    dev = html.xpath("/html/body/div[2]/div/main/div[1]/div")
    urls = []
    for item in dev:
        url = item.xpath("./div[2]/div[1]/a/@href")
        # Entries without the expected link are skipped rather than
        # aborting the crawl with an IndexError.
        if url:
            urls.append(url[0])
    for url in urls:
        get_html(url)