From 67b20ba291bc3f10d07231fca06dfff7c4316d66 Mon Sep 17 00:00:00 2001
From: lsy2246 <lsy200546@hotmail.com>
Date: Sun, 24 Mar 2024 02:22:29 +0800
Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=A2=9E=E7=88=AC=E8=99=AB(=E8=B1=86?=
 =?UTF-8?q?=E7=93=A3)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 python/spider/douban.py | 35 +++++++++++++++++++++++++++++++++++
 python/test/test.py     | 31 +++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)
 create mode 100644 python/spider/douban.py
 create mode 100644 python/test/test.py
diff --git a/python/spider/douban.py b/python/spider/douban.py
new file mode 100644
index 0000000..c3690a9
--- /dev/null
+++ b/python/spider/douban.py
@@ -0,0 +1,35 @@
+import re
+
+import requests
+
+i = 0  # page offset; rebound by the fetch loop below before this initial value is ever read
+
+url = "https://movie.douban.com/top250"
+# Browser-like User-Agent header sent with every request.
+UA = {"User-Agent":
+          "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"}
+
+douban = [] # collected results: one list of match tuples per fetched page
+# NOTE(review): with re.S the lazy ".*?" gaps may run across entry boundaries - verify against the real markup.
+## Pre-compiled pattern; findall() returns (title, playable, name, time) tuples.
+obj = re.compile(r'<li>.*?<span class="title">(?P<title>.*?)</span>.*?'
+                 r' <span class="playable">\[(?P<playable>.*?)\]</span>.*?'
+                 r'<p class="">.*?导演: (?P<name>[\u4e00-\u9fa5]+).*?&nbsp;&nbsp;&nbsp;.*?'
+                 r'<br>.*?(?P<time>\d+)',re.S)
+
+## Fetch loop: start = 0, 25, ..., 225 (ten requests).
+for i in range(0,250,25):
+    # Query parameters for this request; "start" advances by 25 each iteration.
+    param = {"start": i,
+             "filter": ""
+             }
+    # NOTE(review): no timeout is passed and the HTTP status is not checked.
+    resp = requests.get(url, headers=UA, params=param)
+    sj = obj.findall(resp.text)
+    douban.append(sj)  # one nested list per page (not flattened)
+    resp.close()
+
+## Output: print each match tuple from the ten per-page lists.
+for i in range(0,10):
+    for j in douban[i]:
+        print(j)
diff --git a/python/test/test.py b/python/test/test.py
new file mode 100644
index 0000000..5ae5e99
--- /dev/null
+++ b/python/test/test.py
@@ -0,0 +1,31 @@
+import re
+
+import requests
+
+i = 0  # page offset, used as the "start" query parameter of the single request below
+
+url = "https://movie.douban.com/top250"
+# Browser-like User-Agent header sent with the request.
+UA = {"User-Agent":
+          "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"}
+
+douban = [] # storage list; never appended to or read in this script
+
+# Pre-compiled pattern; findall() returns (title, playable, name, time) tuples.
+obj = re.compile(r'<li>.*?<span class="title">(?P<title>.*?)</span>.*?'
+                 r' <span class="playable">\[(?P<playable>.*?)\]</span>.*?'
+                 r'<p class="">.*?导演: (?P<name>[\u4e00-\u9fa5]+).*?&nbsp;&nbsp;&nbsp;.*?'
+                 r'<br>.*?(?P<time>\d+)',re.S)
+
+
+
+# Fetch one page and extract matches. NOTE(review): no timeout or status check.
+param = {"start": i,
+             "filter": ""
+             }
+
+resp = requests.get(url, headers=UA, params=param)
+sj = obj.findall(resp.text)
+resp.close()
+# Prints the type of the findall() result (a list), not the matches themselves.
+print(type(sj))
\ No newline at end of file