小说爬虫

2024-03-29 00:16:12 +08:00 · 2024-03-29 00:16:12 +08:00 · 63859b11a4
commit 63859b11a4
parent 8f5539fc74
1 changed files with 47 additions and 0 deletions
--- a/python/spider/book/biqg.cc.py
+++ b/python/spider/book/biqg.cc.py
@ -0,0 +1,47 @@
+import asyncio
+import re
+
+import aiohttp
+import aiofiles
+from bs4 import BeautifulSoup
+import os
+from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor  # 多线程/多进程
+
+
def _chapter_path(title, directory="xs"):
    """Return the output file path for a chapter, derived from its page title.

    Only the first space-separated token of the (stripped) title is used as
    the file name, and characters that are illegal in file names are replaced
    with underscores.
    """
    stem = title.strip().split(" ")[0]
    # Path separators and Windows-reserved characters must not leak into the
    # file name, otherwise the write lands in the wrong place or fails.
    stem = re.sub(r'[\\/:*?"<>|]', "_", stem)
    # os.path.join picks the right separator for the running platform.
    return os.path.join(directory, stem + ".txt")


def _clean_content(text):
    """Convert paragraph indents to line breaks and strip site boilerplate."""
    return (
        # Two ideographic spaces (U+3000) mark the start of each paragraph.
        text.replace("\u3000\u3000", "\n")
        .replace("请收藏本站：https://www.biqg.cc。笔趣阁手机版：https://m.biqg.cc ", "")
        .replace("『点此报错』『加入书签』", "")
    )


async def aiodownload(url: str) -> None:
    """Download one chapter page and append its text to a file under ``xs/``.

    Args:
        url: Address of the chapter page to fetch.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an error status.
        ValueError: If the page lacks the expected title or content element.
    """
    os.makedirs("xs", exist_ok=True)

    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            # Fail early on 4xx/5xx instead of trying to parse an error page.
            response.raise_for_status()
            html = BeautifulSoup(await response.read(), "html.parser")

    title_tag = html.find("h1", class_="wap_none")
    content_tag = html.find("div", id="chaptercontent")
    if title_tag is None or content_tag is None:
        raise ValueError(f"chapter title or content not found in {url}")

    title = title_tag.text
    # Append mode is kept from the original; UTF-8 is forced so Chinese text
    # is written correctly regardless of the platform's default encoding.
    async with aiofiles.open(_chapter_path(title), "a", encoding="utf-8") as file:
        await file.write(_clean_content(content_tag.text))

    print(title, "成功下载")
+
+
async def main(urls):
    """Download all given chapter URLs concurrently.

    Each URL is handled by ``aiodownload``. A failure in one download does not
    stop the others; failed URLs are reported on stdout once everything has
    finished. An empty ``urls`` is a no-op.

    Args:
        urls: Iterable of chapter page URLs.
    """
    urls = list(urls)
    # gather() accepts zero awaitables (asyncio.wait raises ValueError on an
    # empty set), and return_exceptions=True hands back each failure as a
    # value so it can be reported instead of being silently dropped.
    results = await asyncio.gather(
        *(aiodownload(url) for url in urls), return_exceptions=True
    )
    for url, result in zip(urls, results):
        if isinstance(result, BaseException):
            print(url, "下载失败:", repr(result))
+
+
if __name__ == "__main__":
    # Script entry point: build the chapter URLs for book 3670 and fetch them.
    # range(1, 2) yields only index 1, so a single chapter page is requested.
    urls = [f"https://www.biqg.cc/book/3670/{index}.html" for index in range(1, 2)]
    asyncio.run(main(urls=urls))
    print("全部下载完成")
+