第二次获取资源404,疑似玄学bug

2024-03-30 00:26:07 +08:00 · 2024-03-30 00:26:07 +08:00 · 21a0961d56
commit 21a0961d56
parent 63859b11a4
2 changed files with 169 additions and 31 deletions
--- a/python/spider/Movie/kanjuba.net.py
+++ b/python/spider/Movie/kanjuba.net.py
@ -0,0 +1,169 @@
+from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor  # 多线程/多进程
+import requests
+import re
+import os
+import shutil
+from lxml import etree
+
+
+def downloadm3u81(domain, filepath, ts, headers):
+    """Download one segment whose playlist entry is an absolute URL.
+
+    Args:
+        domain: Unused. Kept so the signature stays parallel to
+            ``downloadm3u82`` and existing keyword callers keep working.
+        filepath: Directory prefix (including the trailing separator) the
+            segment is stored under.
+        ts: Absolute URL of the segment. Surrounding whitespace, such as
+            the newline left over from reading the playlist file, is ignored.
+        headers: HTTP headers sent with the request.
+
+    Returns:
+        None. Progress and failures are reported with ``print``; network
+        and file errors are not raised to the caller.
+    """
+    # The playlist entry already is the full URL. The previous code spliced
+    # the whole entry (newline included) back into itself, which produced a
+    # doubled URL and a file name ending in "\n".
+    url = ts.strip()
+    if not url:
+        return
+    # The local file is named after the last path component of the URL.
+    urlend = url.split('/')[-1]
+    filename = filepath + urlend
+
+    # One real retry: the request is issued again instead of rewriting the
+    # bytes of the same response.
+    for attempt in range(2):
+        print(f"正在下载{urlend}")
+        try:
+            with requests.get(url, headers=headers, timeout=30) as r:
+                if r.status_code != 200:
+                    print(f"{url}访问失败,错误代码{r.status_code}")
+                    return
+                with open(filename, 'wb') as f:
+                    f.write(r.content)
+        except (requests.RequestException, OSError):
+            if attempt == 0:
+                print(f"{urlend}下载失败尝试重新下载")
+            else:
+                print(f"{urlend}下载失败")
+        else:
+            print(f"{urlend}下载完成")
+            return
+
+
+def merge_ts_to_mp41(filepath, title):
+    """Concatenate the downloaded segments into ``{title}.mp4``.
+
+    The segment order is read from ``{filepath}{title}.m3u8``. Each entry is
+    reduced to its last ``/``-separated component and looked up inside
+    ``filepath``.
+
+    Args:
+        filepath: Directory prefix (including the trailing separator) that
+            holds the playlist file and the segments.
+        title: Base name of the playlist file and of the output video.
+
+    Raises:
+        FileNotFoundError: If the playlist or a listed segment is missing.
+    """
+    # "wb" rather than "ab": appending to a file left over from an earlier
+    # (possibly aborted) run would duplicate or corrupt the video.
+    with open(f"{filepath}{title}.m3u8", "r") as file:
+        with open(f"{title}.mp4", "wb") as video:
+            print("正在合成,请耐心等待")
+            for line in file:
+                # Drop the newline and surrounding whitespace from the name.
+                name = line.split("/")[-1].strip()
+                if not name:
+                    # A blank line would otherwise resolve to the directory.
+                    continue
+                with open(filepath + name, "rb") as ts:
+                    # Stream the segment instead of loading it into memory.
+                    shutil.copyfileobj(ts, video)
+
+
+def merge_ts_to_mp42(filepath, title):
+    """Concatenate the downloaded segments into ``{title}.mp4``.
+
+    The segment order is read from ``{filepath}{title}.m3u8``. Each segment
+    path is printed as it is appended. Previously this function opened the
+    output file and printed the paths but never wrote any data, so the
+    resulting video was empty.
+
+    Args:
+        filepath: Directory prefix (including the trailing separator) that
+            holds the playlist file and the segments.
+        title: Base name of the playlist file and of the output video.
+
+    Raises:
+        FileNotFoundError: If the playlist or a listed segment is missing.
+    """
+    # "wb" rather than "ab": appending to a file left over from an earlier
+    # run would duplicate or corrupt the video.
+    with open(f"{filepath}{title}.m3u8", "r") as file:
+        with open(f"{title}.mp4", "wb") as video:
+            print("正在合成,请耐心等待")
+            for line in file:
+                # Use the last path component so absolute-URL entries also
+                # resolve to a file name; plain names are unchanged by this.
+                name = line.strip().split("/")[-1]
+                if not name:
+                    continue
+                name = filepath + name
+                print(name)
+                with open(name, "rb") as ts:
+                    shutil.copyfileobj(ts, video)
+
+
+def downloadm3u82(filepath, domain, ts, headers):
+    """Download one segment whose playlist entry is a relative reference.
+
+    Args:
+        filepath: Directory prefix (including the trailing separator) the
+            segment is stored under.
+        domain: URL of the playlist the entry came from; the entry is
+            resolved against it.
+        ts: Relative segment reference. Surrounding whitespace, such as the
+            newline left over from reading the playlist file, is ignored.
+        headers: HTTP headers sent with the request.
+
+    Returns:
+        None. Progress and failures are reported with ``print``; network
+        and file errors are not raised to the caller.
+    """
+    from urllib.parse import urljoin
+
+    # A trailing newline used to end up in both the URL and the file name.
+    name = ts.strip()
+    if not name:
+        return
+    # urljoin replaces only the last path component of the playlist URL and
+    # also handles entries that start with "/" (str.replace rewrote every
+    # occurrence of the playlist name anywhere in the URL).
+    url = urljoin(domain, name)
+    filename = filepath + name.split("/")[-1]
+
+    # One real retry: the request is issued again instead of rewriting the
+    # bytes of the same response.
+    for attempt in range(2):
+        print(f"正在下载{name}")
+        try:
+            with requests.get(url, headers=headers, timeout=30) as r:
+                if r.status_code != 200:
+                    print(f"{url}访问失败,错误代码{r.status_code}")
+                    return
+                with open(filename, 'wb') as f:
+                    f.write(r.content)
+        except (requests.RequestException, OSError):
+            if attempt == 0:
+                print(f"{name}下载失败尝试重新下载")
+            else:
+                print(f"{name}下载失败")
+        else:
+            print(f"{name}下载完成")
+            return
+
+
+def _fetch_text(url, headers):
+    """GET ``url`` and return the response text; exit with status 1 on a non-200 reply."""
+    with requests.get(url, headers=headers, timeout=30) as resp:
+        if resp.status_code != 200:
+            print("访问失败")
+            raise SystemExit(1)
+        return resp.text
+
+
+def _playlist_entries(text):
+    """Return the entry lines of an m3u8 text, without tags, comments and blanks."""
+    entries = []
+    for line in text.split("\n"):
+        # Lines starting with "#" are tags/comments; lines starting with a
+        # space are skipped as well, as before.
+        if line.startswith(("#", " ")):
+            continue
+        # Removes a trailing "\r" from CRLF playlists and drops empty lines.
+        line = line.strip()
+        if line:
+            entries.append(line)
+    return entries
+
+
+## Fetch the m3u8 playlist, download its segments and merge them
+def getm3u8(domain, headers, max_workers=1):
+    """Download the video behind a play page and merge it into ``{title}.mp4``.
+
+    Steps: fetch the page, read the title and the first ``http...m3u8`` URL
+    from it, resolve one level of nested playlist if needed, store the entry
+    list in ``tmpMovie/{title}/{title}.m3u8``, download every entry, merge
+    the segments and finally delete ``tmpMovie``.
+
+    Args:
+        domain: URL of the play page.
+        headers: HTTP headers sent with every request.
+        max_workers: Number of download threads. Defaults to 1, the value
+            that was previously hard-coded.
+
+    Raises:
+        SystemExit: With status 1 when a page/playlist request does not
+            return 200, or when no title, playlist URL or playlist entry
+            can be found.
+    """
+    from urllib.parse import urljoin
+
+    ## Fetch the play page
+    page = _fetch_text(domain, headers)
+    titles = etree.HTML(page).xpath('/html/body/div/div[2]/div/div[2]/div/div/div/div[1]/div[1]/h2/text()')
+    # The dot before "m3u8" is escaped so it no longer matches any character.
+    playlist_urls = re.findall(r'http.*?\.m3u8', page)
+    if not titles or not playlist_urls:
+        # Indexing [0] on an empty result used to raise IndexError here.
+        print("访问失败")
+        raise SystemExit(1)
+    title = titles[0].strip()
+
+    # os.path.join/os.sep give "tmpMovie/<title>/" on posix and
+    # "tmpMovie\<title>\" on nt, and stay defined on any other os.name.
+    filepath = os.path.join("tmpMovie", title) + os.sep
+    os.makedirs(filepath, exist_ok=True)
+    print(f'临时文件夹“{filepath}”创建成功')
+
+    ## Read the m3u8 playlist
+    print("开始读取m3u8文件")
+    domain = playlist_urls[0]
+    text = _fetch_text(domain, headers)
+    entries = _playlist_entries(text)
+    # A playlist that carries EXT-X-STREAM-INF tags, or is very short (the
+    # previous heuristic), points at another playlist. Follow its last entry.
+    if entries and ("#EXT-X-STREAM-INF" in text or len(text.split("\n")) < 5):
+        # urljoin also resolves entries given as absolute paths; str.replace
+        # on the file name produced a wrong URL for those.
+        domain = urljoin(domain, entries[-1].split()[0])
+        entries = _playlist_entries(_fetch_text(domain, headers))
+    if not entries:
+        print("访问失败")
+        raise SystemExit(1)
+
+    # "w" instead of "a+": a file left over from an earlier run would
+    # otherwise have the same entries appended a second time.
+    with open(f"{filepath}{title}.m3u8", "w") as m3u8file:
+        m3u8file.writelines(f"{ts}\n" for ts in entries)
+    print("m3u8文件读取完成")
+
+    ## Download the segments
+    print(f'开始下载视频"{title}"')
+    futures = {}
+    with ThreadPoolExecutor(max_workers=max_workers) as xc:
+        for ts in entries:
+            # Entries are passed without the trailing newline that iterating
+            # over the playlist file used to leave on them.
+            if ts.startswith("http"):
+                job = xc.submit(downloadm3u81, domain=domain, filepath=filepath, ts=ts, headers=headers)
+            else:
+                job = xc.submit(downloadm3u82, filepath=filepath, domain=domain, ts=ts, headers=headers)
+            futures[job] = ts
+    # Leaving the executor block waits for all jobs; report any exception a
+    # worker raised instead of dropping it silently.
+    for job, ts in futures.items():
+        error = job.exception()
+        if error is not None:
+            print(f"{ts}下载失败: {error}")
+
+    ## Merge the video
+    print("开始合成视频")
+    # merge_ts_to_mp41 looks segments up by the last path component, so it
+    # copes with both absolute-URL and relative playlist entries.
+    merge_ts_to_mp41(filepath, title)
+    shutil.rmtree("tmpMovie")
+    print("下载已经全部完成")
+    print(f"文件在当前根目录{title}.mp4")
+
+
+if __name__ == "__main__":
+    # Headers handed to getm3u8 together with the play-page URL.
+    headers = {}
+    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
+    headers['Referer'] = "https://kanjuba.net/"
+    headers['Sec-Ch-Ua'] = '"Google Chrome";v="123", "Not:A-Brand";v="8", "Chromium";v="123"'
+    headers['Sec-Ch-Ua-Platform'] = '"Windows"'
+
+    # Play page to download.
+    domain = "https://kanjuba.net/play/93312-0-9.html"
+
+    getm3u8(domain, headers)
--- a/python/test/test.py
+++ b/python/test/test.py
@ -1,31 +0,0 @@
-import re
-
-import requests
-
-i = 0  # 控制页数的遍历
-
-url = "https://movie.douban.com/top250"
-
-UA = {"User-Agent":
-          "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"}
-
-douban = [] #储存数据
-
-
-obj = re.compile(r'<li>.*?<span class="title">(?P<title>.*?)</span>.*?'
-                 r' <span class="playable">\[(?P<playable>.*?)\]</span>.*?'
-                 r'<p class="">.*?导演: (?P<name>[\u4e00-\u9fa5]+).*?&nbsp;&nbsp;&nbsp;.*?'
-                 r'<br>.*?(?P<time>\d+)',re.S)
-
-
-
-
-param = {"start": i,
-             "filter": ""
-             }
-
-resp = requests.get(url, headers=UA, params=param)
-sj = obj.findall(resp.text)
-resp.close()
-
-print(type(sj))