diff --git a/python/spider/blog/wz.py b/python/spider/blog/wz.py
index e17cb9b..3603117 100644
--- a/python/spider/blog/wz.py
+++ b/python/spider/blog/wz.py
@@ -2,24 +2,29 @@ import requests
 from threading import Thread
 from lxml import etree
 import re
+import os
 
 
 def get_html(url):
+    if not os.path.exists("wz"):
+        os.mkdir("wz")
     resp = requests.get(url)
     resp.close()
     html = etree.HTML(resp.text)
     title = html.xpath('/html/body/div[2]/div/main/div[1]/div/h1/text()')
     content = html.xpath('/html/body/div[2]/div/main/article/div')
     title = ''.join(list(title[0]))
-    file = open(f"wz\\{title}.txt", "a+")
-    file.write(f"URL : {url}\n")
+    file = open(f"wz\\{title}.txt", "a+", errors='ignore')
+    URL = (f"URL : {url}\n")
+    file.write(URL)
+    print(f"正在抓取{title}")
     ## 获取内容
     for i in content:
         text = i.xpath('.//text()')
         for word in text:
-            word_encode = word.encode('utf-8')
             file.write(word)
     file.close()
+    print(f"{title}抓取成功")
 
 
 for index in range(1, 4):