wz博客文章

2024-03-27 17:08:25 +08:00 · 2024-03-27 17:08:25 +08:00 · dd7b4df73a
commit dd7b4df73a
parent aef2ea236f
1 changed files with 37 additions and 0 deletions
--- a/python/spider/blog/wz.py
+++ b/python/spider/blog/wz.py
@ -0,0 +1,37 @@
+import requests
+from threading import Thread
+from lxml import etree
+import re
+
+
+def get_html(url):
+    resp = requests.get(url)
+    resp.close()
+    html = etree.HTML(resp.text)
+    title = html.xpath('/html/body/div[2]/div/main/div[1]/div/h1/text()')
+    content = html.xpath('/html/body/div[2]/div/main/article/div')
+    title = ''.join(list(title[0]))
+    file = open(f"wz\\{title}.txt", "a+")
+    file.write(f"URL : {url}\n")
+    ## 获取内容
+    for i in content:
+        text = i.xpath('.//text()')
+        for word in text:
+            word_encode = word.encode('utf-8')
+            file.write(word)
+    file.close()
+
+
+for index in range(1, 4):
+    domain = f"https://wyc21.com/index.php/page/{index}"
+    resp = requests.get(domain)
+    resp.close()
+    html = etree.HTML(resp.text)
+    dev = html.xpath("/html/body/div[2]/div/main/div[1]/div")
+    urls = []
+    for item in dev:
+        url = item.xpath("./div[2]/div[1]/a/@href")
+        urls.append(url[0])
+
+    for url in urls:
+        get_html(url)