From 04d9de88e83d38c804b6940917e2ee3c7caaac86 Mon Sep 17 00:00:00 2001 From: lsy2246 Date: Wed, 27 Mar 2024 18:37:21 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0wz?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- python/spider/blog/wz.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/python/spider/blog/wz.py b/python/spider/blog/wz.py index e17cb9b..3603117 100644 --- a/python/spider/blog/wz.py +++ b/python/spider/blog/wz.py @@ -2,24 +2,29 @@ import requests from threading import Thread from lxml import etree import re +import os def get_html(url): + if not os.path.exists("wz"): + os.mkdir("wz") resp = requests.get(url) resp.close() html = etree.HTML(resp.text) title = html.xpath('/html/body/div[2]/div/main/div[1]/div/h1/text()') content = html.xpath('/html/body/div[2]/div/main/article/div') title = ''.join(list(title[0])) - file = open(f"wz\\{title}.txt", "a+") - file.write(f"URL : {url}\n") + file = open(f"wz\\{title}.txt", "a+", errors='ignore') + URL = (f"URL : {url}\n") + file.write(URL) + print(f"正在抓取{title}") ## 获取内容 for i in content: text = i.xpath('.//text()') for word in text: - word_encode = word.encode('utf-8') file.write(word) file.close() + print(f"{title}抓取成功") for index in range(1, 4):