小说爬虫

This commit is contained in:
lsy2246 2024-03-29 00:16:12 +08:00
parent 8f5539fc74
commit 63859b11a4

View File

@ -0,0 +1,47 @@
import asyncio
import re
import aiohttp
import aiofiles
from bs4 import BeautifulSoup
import os
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor # 多线程/多进程
async def aiodownload(url):
    """Download one chapter page and append its cleaned text to a file in ``xs``.

    The page is fetched with aiohttp and parsed with BeautifulSoup. The chapter
    title is read from ``<h1 class="wap_none">`` and the body text from
    ``<div id="chaptercontent">``. The output file is named after the first
    space-separated token of the title, with a ``.txt`` suffix, and is opened
    in append mode, so downloading the same chapter twice duplicates its text.

    Args:
        url: Address of the chapter page to download.

    Raises:
        ValueError: If the page contains no title or no content element.
    """
    # exist_ok makes directory creation a no-op when "xs" is already present.
    os.makedirs("xs", exist_ok=True)

    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            html = BeautifulSoup(await response.read(), "html.parser")

    title_tag = html.find("h1", class_="wap_none")
    content_tag = html.find("div", id="chaptercontent")
    if title_tag is None or content_tag is None:
        # Fail with the offending URL instead of an AttributeError on None.
        raise ValueError(f"chapter title or content not found in page: {url}")

    title = title_tag.text
    # os.path.join selects the right separator for the running platform; the
    # hand-built prefix used a backslash on POSIX, which produced a file named
    # "xs\<title>.txt" outside the directory, and left the name undefined on
    # platforms that are neither 'posix' nor 'nt'.
    name = os.path.join("xs", title.split(" ")[0] + ".txt")

    # Turn the site's paragraph indentation into line breaks and strip the
    # site's promotional footer and link text.
    content = content_tag.text
    content = content.replace("  ", "\n")
    content = content.replace(
        "请收藏本站https://www.biqg.cc。笔趣阁手机版https://m.biqg.cc ", "")
    content = content.replace("『点此报错』『加入书签』", "")

    # An explicit UTF-8 encoding keeps Chinese text writable regardless of the
    # locale's default encoding (e.g. on Windows).
    async with aiofiles.open(name, "a+", encoding="utf-8") as file:
        await file.write(content)
    print(title, "成功下载")
async def main(urls):
    """Download every chapter URL concurrently and report the ones that fail.

    Args:
        urls: Iterable of chapter page URLs; each one is handed to
            ``aiodownload`` in its own task.

    Returns:
        None. Failures are printed per URL rather than raised, so one bad
        chapter does not prevent the others from being downloaded.
    """
    urls = list(urls)
    if not urls:
        # asyncio.wait() raises ValueError for an empty collection; with
        # nothing to download there is simply nothing to do.
        return

    tasks = [asyncio.create_task(aiodownload(url)) for url in urls]
    # return_exceptions=True retrieves every task's outcome, so errors are
    # reported here instead of surfacing only as "Task exception was never
    # retrieved" warnings.
    results = await asyncio.gather(*tasks, return_exceptions=True)
    for url, result in zip(urls, results):
        if isinstance(result, BaseException):
            print(url, "下载失败:", repr(result))
if __name__ == "__main__":
    # Script entry point: build the chapter URLs for book 3670, download them
    # all, then announce completion. range(1, 2) yields only chapter 1.
    chapter_urls = [
        f"https://www.biqg.cc/book/3670/{chapter}.html"
        for chapter in range(1, 2)
    ]
    asyncio.run(main(urls=chapter_urls))
    print("全部下载完成")