"""Scrape the Douban Books Top 250 list and print the state icon of each book.

Provenance: recovered from git patch 9405eb5cf7048e0605cc3c0bbefa1f1de8a180a4
("Douban Books top250", author lsy, 2024-11-03), which creates
python/spider/book/douban.py.
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd

BASE_URL = "https://book.douban.com/top250"
PAGE_SIZE = 25      # the listing is paged via ?start=0, 25, 50, ...
TOTAL_BOOKS = 250   # size of the Top 250 list
COLUMNS = ['href', 'img', 'title', 'state', 'author', 'grade']
HEADERS = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/130.0.0.0 Safari/537.36"
    )
}


def spider(url, timeout=10):
    """Fetch one listing page and extract one record per ``.item`` element.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys ``href``, ``img``, ``title``,
        ``state``, ``author`` and ``grade``. ``state`` is the ``src`` of the
        image found inside the ``.pl2`` title cell, or ``None`` when that
        cell contains no image.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
        TypeError, KeyError: If an item lacks one of the expected elements
            or attributes (the page markup changed).
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    # Fail loudly on an error page instead of silently parsing zero items.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    books = []
    for item in soup.select('.item'):
        name_field = item.select_one('.pl2')
        # Optional icon next to the title; look it up once and reuse it.
        state_icon = name_field.find('img')
        books.append({
            'href': item.select_one('.nbg')['href'],
            'img': item.select_one('.nbg img')['src'],
            'title': name_field.find('a')['title'],
            'state': state_icon['src'] if state_icon is not None else None,
            # The '.pl' text is '/'-separated; the first segment is kept and
            # the whitespace surrounding the separator is dropped.
            'author': item.select_one('.pl').text.split('/')[0].strip(),
            'grade': item.select_one('.rating_nums').text,
        })
    return books


def main():
    """Crawl every page of the list and print the non-null ``state`` values."""
    books = []
    # 250 books at 25 per page -> offsets 0..225 (ten pages, not eleven).
    for start in range(0, TOTAL_BOOKS, PAGE_SIZE):
        books += spider(f"{BASE_URL}?start={start}")

    # Explicit columns keep 'state' present even if nothing was scraped,
    # so the dropna below cannot fail with a KeyError on an empty frame.
    df = pd.DataFrame(books, columns=COLUMNS)
    new_df = df.dropna(subset=['state'])

    print(new_df.state)


if __name__ == "__main__":
    main()