import os
import re
from threading import Thread

import requests
from lxml import etree

# Seconds to wait for the server before a request is abandoned; without a
# timeout a stalled connection would block the crawl forever.
REQUEST_TIMEOUT = 10
# Directory that receives one text file per article.
OUTPUT_DIR = "wz"
# Characters that cannot appear in a file name on Windows (a superset of the
# POSIX restrictions), plus ASCII control characters such as newlines.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _fetch_tree(url):
    """Download *url* and return the page parsed as an lxml HTML tree."""
    with requests.get(url, timeout=REQUEST_TIMEOUT) as resp:
        return etree.HTML(resp.text)


def _safe_filename(title):
    """Return *title* with characters that are invalid in file names replaced."""
    cleaned = _UNSAFE_FILENAME_CHARS.sub("_", title).strip()
    return cleaned or "untitled"


def get_html(url):
    """Download one article page and append its text to ``wz/<title>.txt``.

    The file is named after the article's ``<h1>`` title (sanitized for use as
    a file name) and is opened in append mode, so fetching the same article
    again adds to the existing file. The first line written is the source URL,
    followed by every text node found under the article body, concatenated
    without separators.

    Args:
        url: Address of the article page to download.

    Returns:
        None. If the page has no title at the expected location, a message is
        printed and nothing is written.

    Raises:
        requests.RequestException: If the download fails or times out.
        OSError: If the output directory or file cannot be written.
    """
    html = _fetch_tree(url)
    titles = html.xpath('/html/body/div[2]/div/main/div[1]/div/h1/text()')
    if not titles:
        # Unexpected page layout: skip it rather than abort the whole crawl.
        print(f"skipped {url}: no title found")
        return
    content = html.xpath('/html/body/div[2]/div/main/article/div')

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    path = os.path.join(OUTPUT_DIR, f"{_safe_filename(titles[0])}.txt")
    # Explicit UTF-8 so non-ASCII article text does not depend on the
    # platform's default encoding.
    with open(path, "a", encoding="utf-8") as file:
        file.write(f"URL : {url}\n")
        # Collect the content: every text node below each article <div>.
        for node in content:
            file.writelines(node.xpath('.//text()'))


# Script entry: walk listing pages 1-3 and save every article they link to.
for index in range(1, 4):
    listing = _fetch_tree(f"https://wyc21.com/index.php/page/{index}")
    cards = listing.xpath("/html/body/div[2]/div/main/div[1]/div")
    for card in cards:
        links = card.xpath("./div[2]/div[1]/a/@href")
        # Not every child <div> of the listing is guaranteed to hold a link.
        if links:
            get_html(links[0])