Spiders for Pear Video, Dianying Tiantang, and the 22 image host
This commit is contained in:
parent 67b20ba291
commit aef2ea236f
48 python/spider/Movie/22tc.py Normal file
@@ -0,0 +1,48 @@
import requests  # fetch web pages
from bs4 import BeautifulSoup

# Fetch the image host's "explore" page, which lists the pictures
domain = "https://tc.lsy22.com/explore"

i = 1

UA = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"}

# Query parameters for later pages of the listing; page 1 loads without them
param = {"page": i,
         "seek": "2023-10-20+13%3A40%3A23.5pOu"}

resp = requests.get(domain, headers=UA)

page = BeautifulSoup(resp.text, "html.parser")

items = page.find_all("div", class_="list-item fixed-size c8 gutter-margin-right-bottom")

resp.close()

href = []

# Collect each image's detail-page link
for item in items:
    href.append(item.find("div", class_="list-item-image fixed-size").find("a").get("href"))

j = 0

# Download the images
for i in range(len(items)):
    j += 1
    # Fetch the detail page and pull the direct image URL from its header bar
    resp = requests.get(href[i], headers=UA)
    page = BeautifulSoup(resp.text, "html.parser")
    image = page.find("div", class_="content-width margin-top-10").find(
        "div", class_="header header-content margin-bottom-10").find(
        "div", class_="header-content-right").find("a").get("href")
    # Download the image and save it as image\<j>.<extension>
    img_resp = requests.get(image, headers=UA)
    img_name = "image\\" + str(j) + "." + image.split(".")[-1]
    img_file = open(img_name, "wb")
    img_file.write(img_resp.content)
    print("Succeeded", j, "times")
    # Close the file and the connections
    img_file.close()
    img_resp.close()
    resp.close()
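Note: the script above builds a param dict with a page number but never sends it, so only page 1 of the listing is fetched. A minimal paging sketch, assuming /explore accepts the page query parameter that the dict implies (the seek token is copied from the script and may be session-specific; explore_links is a hypothetical helper name):

import requests
from bs4 import BeautifulSoup

UA = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                    "(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"}

def explore_links(page_no):
    # Build the URL by hand so the already-encoded seek token is not re-encoded
    listing = (f"https://tc.lsy22.com/explore?page={page_no}"
               "&seek=2023-10-20+13%3A40%3A23.5pOu")
    resp = requests.get(listing, headers=UA)
    page = BeautifulSoup(resp.text, "html.parser")
    items = page.find_all("div", class_="list-item fixed-size c8 gutter-margin-right-bottom")
    resp.close()
    # Return each image's detail-page link, as in the loop above
    return [item.find("div", class_="list-item-image fixed-size").find("a").get("href")
            for item in items]

for page_no in range(1, 4):  # first three listing pages
    print(explore_links(page_no))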
56 python/spider/Movie/dianyingtiantang.py Normal file
@@ -0,0 +1,56 @@
import re  # regular expressions
import requests  # fetch web pages
import warnings

warnings.filterwarnings("ignore")  # silence the HTTPS (port 443) certificate warnings

file = open("电影天堂.txt", "a+", encoding="utf-8")

domain = "https://www.dyttcn.com"

UA = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"}

# Match each movie's detail-page link in the category listing
rules1 = re.compile(r'<a href="(?P<url>.*?\.html)" class="ulink" title="\d+年.*?片《.*?》.*?">.*?</a>')

data1 = []

# Walk the first 15 pages of the category listing
for index in range(1, 16):
    class_typical = f"/jingdiandapian/list_18_{index}.html"
    resp1 = requests.get(domain + class_typical, headers=UA, verify=False)
    resp1.encoding = "gb2312"
    data1.append(rules1.findall(resp1.text))
    resp1.close()

# Pull the title, year, genre, and magnet link out of each detail page
rules2 = re.compile(r'.*?◎片 名 (?P<name>.*?)</p>'
                    r'.*?◎年 代 (?P<year>\d+)</p>'
                    r'.*?◎类 别 (?P<class>.*?)</p>'
                    r'.*?<a href="(?P<url>magnet:\?xt=urn:btih:.*?)">', re.S)

for i in data1:
    for j in i:
        if isinstance(j, str):
            resp2 = requests.get(domain + j, headers=UA, verify=False)
        elif isinstance(j, list):
            resp2 = requests.get(domain + j[0], headers=UA, verify=False)
        print(j)
        resp2.encoding = "gb2312"
        data2 = rules2.findall(resp2.text)
        resp2.close()
        if len(data2) > 0:
            print(str(data2))
            file.write(str(data2))
            file.write('\n')

file.close()
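Note: rules2 depends on the ◎-prefixed field markers in each detail page. A quick offline check of the regex against a made-up fragment (the HTML below is invented to mirror the markers the pattern expects, not copied from the site):

import re

rules2 = re.compile(r'.*?◎片 名 (?P<name>.*?)</p>'
                    r'.*?◎年 代 (?P<year>\d+)</p>'
                    r'.*?◎类 别 (?P<class>.*?)</p>'
                    r'.*?<a href="(?P<url>magnet:\?xt=urn:btih:.*?)">', re.S)

# Hypothetical detail-page fragment shaped like the markers the regex expects
sample = ('<p>◎片 名 Example Movie</p>'
          '<p>◎年 代 2001</p>'
          '<p>◎类 别 Drama</p>'
          '<a href="magnet:?xt=urn:btih:0123456789abcdef">download</a>')

print(rules2.findall(sample))
# [('Example Movie', '2001', 'Drama', 'magnet:?xt=urn:btih:0123456789abcdef')]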
32 python/spider/Movie/lishiping.py Normal file
@@ -0,0 +1,32 @@
import requests

url = "https://www.pearvideo.com/video_1792985"

# The contId is the number after "video_" in the page URL
conID = url.split("_")[1]
videourl = f"https://www.pearvideo.com/videoStatus.jsp?contId={conID}"

# The Referer must point back at the video page, or the site's anti-hotlinking check rejects the request
header = {"Referer": url,
          "User-Agent":
              "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
          }

resp = requests.get(videourl, headers=header)

dic = resp.json()

srcUrl = dic["videoInfo"]["videos"]["srcUrl"]

systemTime = dic["systemTime"]

# srcUrl carries a timestamp where the real address has "cont-<contId>"; swap them to get a downloadable URL
downloadurl = srcUrl.replace(systemTime, f"cont-{conID}")

video = requests.get(downloadurl, headers=header)

file = open("lsp.mp4", "wb+")

file.write(video.content)

file.close()
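Note: the timestamp-for-contId swap is not specific to this one video, so the downloader generalizes to any Pear Video page URL. A sketch under that assumption (download_pearvideo and its arguments are hypothetical names):

import requests

CHROME_UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
             "(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36")

def download_pearvideo(page_url, out_path):
    # contId is the number after "video_" in the page URL
    cont_id = page_url.split("_")[1]
    status_url = f"https://www.pearvideo.com/videoStatus.jsp?contId={cont_id}"
    header = {"Referer": page_url, "User-Agent": CHROME_UA}
    dic = requests.get(status_url, headers=header).json()
    # The API returns a URL with a timestamp where the real address
    # has "cont-<contId>"; swapping them yields the downloadable file
    src = dic["videoInfo"]["videos"]["srcUrl"]
    real = src.replace(dic["systemTime"], f"cont-{cont_id}")
    with open(out_path, "wb") as f:
        f.write(requests.get(real, headers=header).content)

download_pearvideo("https://www.pearvideo.com/video_1792985", "lsp.mp4")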