# practice_code/python/spider/blog/wz.py

import requests
from threading import Thread
from lxml import etree
import re
import os


def get_html(url, timeout=30):
    """Download one article page and append its text to ``wz/<title>.txt``.

    The title is taken from the page's ``<h1>`` and the body text from the
    ``<article>`` container, both located with fixed absolute XPaths. The
    output file is opened in append mode, so fetching the same article twice
    appends a second copy to the same file.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        None. A page on which no usable title is found is reported on stdout
        and skipped instead of raising ``IndexError``.
    """
    # The context manager releases the connection even if reading the body
    # fails; requests applies no timeout unless one is passed explicitly.
    with requests.get(url, timeout=timeout) as resp:
        page = etree.HTML(resp.text)

    titles = page.xpath('/html/body/div[2]/div/main/div[1]/div/h1/text()')
    content = page.xpath('/html/body/div[2]/div/main/article/div')

    title = str(titles[0]) if titles else ""
    # Replace characters that are not allowed in file names (path separators,
    # Windows-reserved punctuation, control characters such as newlines).
    safe_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", title).strip()
    if not safe_name:
        print(f"未找到标题，跳过：{url}")
        return

    os.makedirs("wz", exist_ok=True)
    # os.path.join keeps the file inside the "wz" directory on every
    # platform; a literal backslash is only a separator on Windows.
    path = os.path.join("wz", f"{safe_name}.txt")

    # An explicit encoding avoids depending on the platform default, which
    # may be unable to represent some characters of the article.
    with open(path, "a", encoding="utf-8", errors="ignore") as file:
        file.write(f"URL : {url}\n")
        print(f"正在抓取{title}")
        # Write every text node under each content <div>, in document order.
        for node in content:
            file.writelines(node.xpath('.//text()'))
    print(f"{title}抓取成功")


# Crawl listing pages 1-3 and save every article linked from each page.
for index in range(1, 4):
    domain = f"https://wyc21.com/index.php/page/{index}"
    # requests applies no timeout by default; bound the wait so one stalled
    # listing page cannot hang the whole crawl.
    with requests.get(domain, timeout=30) as resp:
        html = etree.HTML(resp.text)

    # Every child <div> of the listing container is treated as one entry.
    dev = html.xpath("/html/body/div[2]/div/main/div[1]/div")
    urls = []
    for item in dev:
        links = item.xpath("./div[2]/div[1]/a/@href")
        # A <div> without the expected link yields an empty list; skip it
        # rather than failing with IndexError on links[0].
        if links:
            urls.append(links[0])

    for url in urls:
        get_html(url)
wz博客文章 2024-03-27 17:08:25 +08:00			`import requests`
			`from threading import Thread`
			`from lxml import etree`
			`import re`
更新wz 2024-03-27 18:37:21 +08:00			`import os`
wz博客文章 2024-03-27 17:08:25 +08:00

			`def get_html(url):`
更新wz 2024-03-27 18:37:21 +08:00			`if not os.path.exists("wz"):`
			`os.mkdir("wz")`
wz博客文章 2024-03-27 17:08:25 +08:00			`resp = requests.get(url)`
			`resp.close()`
			`html = etree.HTML(resp.text)`
			`title = html.xpath('/html/body/div[2]/div/main/div[1]/div/h1/text()')`
			`content = html.xpath('/html/body/div[2]/div/main/article/div')`
			`title = ''.join(list(title[0]))`
更新wz 2024-03-27 18:37:21 +08:00			`file = open(f"wz\\{title}.txt", "a+", errors='ignore')`
			`URL = (f"URL : {url}\n")`
			`file.write(URL)`
			`print(f"正在抓取{title}")`
wz博客文章 2024-03-27 17:08:25 +08:00			`## 获取内容`
			`for i in content:`
			`text = i.xpath('.//text()')`
			`for word in text:`
			`file.write(word)`
			`file.close()`
更新wz 2024-03-27 18:37:21 +08:00			`print(f"{title}抓取成功")`
wz博客文章 2024-03-27 17:08:25 +08:00

			`for index in range(1, 4):`
			`domain = f"https://wyc21.com/index.php/page/{index}"`
			`resp = requests.get(domain)`
			`resp.close()`
			`html = etree.HTML(resp.text)`
			`dev = html.xpath("/html/body/div[2]/div/main/div[1]/div")`
			`urls = []`
			`for item in dev:`
			`url = item.xpath("./div[2]/div[1]/a/@href")`
			`urls.append(url[0])`

			`for url in urls:`
			`get_html(url)`