From 67b20ba291bc3f10d07231fca06dfff7c4316d66 Mon Sep 17 00:00:00 2001
From: lsy2246
Date: Sun, 24 Mar 2024 02:22:29 +0800
Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E7=88=AC=E8=99=AB(=E8=B1=86?=
 =?UTF-8?q?=E7=93=A3)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (free-form text after the "---" separator; not part of the
commit content):

* python/spider/douban.py requests https://movie.douban.com/top250 once per
  page offset (start = 0, 25, ..., 225), runs one compiled regex over each
  response body with findall(), collects the per-page result lists in
  `douban`, and finally prints every match of the ten pages.
* python/test/test.py is a single-request variant (start = 0) that only
  prints type(sj).
* The regex captures four groups: title, playable, name (director) and
  time. The `name` group is limited to the range \u4e00-\u9fa5, so it stops
  at the first character outside that range.
* requests.get() is called without a `timeout` argument in both files.
* In douban.py the module-level `i = 0` is overwritten by both `for` loops.
* NOTE(review): the run of blanks inside the third regex fragment (and the
  leading blank of the second fragment) may have been `&nbsp;` entities in
  the original commit -- confirm against the repository before applying.

 python/spider/douban.py | 35 +++++++++++++++++++++++++++++++++++
 python/test/test.py     | 31 +++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)
 create mode 100644 python/spider/douban.py
 create mode 100644 python/test/test.py

diff --git a/python/spider/douban.py b/python/spider/douban.py
new file mode 100644
index 0000000..c3690a9
--- /dev/null
+++ b/python/spider/douban.py
@@ -0,0 +1,35 @@
+import re
+
+import requests
+
+i = 0 # 控制页数的遍历
+
+url = "https://movie.douban.com/top250"
+
+UA = {"User-Agent":
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"}
+
+douban = [] #储存数据
+
+##预订正则表达式
+obj = re.compile(r'<li>.*?<span class="title">(?P<title>.*?)</span>.*?'
+                 r' <span class="playable">\[(?P<playable>.*?)\]</span>.*?'
+                 r'<p class="">.*?导演: (?P<name>[\u4e00-\u9fa5]+).*?   .*?'
+                 r'<br>.*?(?P<time>\d+)',re.S)
+
+##循环获取数据
+for i in range(0,250,25):
+
+    param = {"start": i,
+             "filter": ""
+             }
+
+    resp = requests.get(url, headers=UA, params=param)
+    sj = obj.findall(resp.text)
+    douban.append(sj)
+    resp.close()
+
+##输出
+for i in range(0,10):
+    for j in douban[i]:
+        print(j)
diff --git a/python/test/test.py b/python/test/test.py
new file mode 100644
index 0000000..5ae5e99
--- /dev/null
+++ b/python/test/test.py
@@ -0,0 +1,31 @@
+import re
+
+import requests
+
+i = 0 # 控制页数的遍历
+
+url = "https://movie.douban.com/top250"
+
+UA = {"User-Agent":
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"}
+
+douban = [] #储存数据
+
+
+obj = re.compile(r'<li>.*?<span class="title">(?P<title>.*?)</span>.*?'
+                 r' <span class="playable">\[(?P<playable>.*?)\]</span>.*?'
+                 r'<p class="">.*?导演: (?P<name>[\u4e00-\u9fa5]+).*?   .*?'
+                 r'<br>.*?(?P<time>\d+)',re.S)
+
+
+
+
+param = {"start": i,
+         "filter": ""
+         }
+
+resp = requests.get(url, headers=UA, params=param)
+sj = obj.findall(resp.text)
+resp.close()
+
+print(type(sj))
\ No newline at end of file