diff --git a/python/spider/info/weather/scrapy.cfg b/python/spider/info/weather/scrapy.cfg
new file mode 100644
index 0000000..e57f3b9
--- /dev/null
+++ b/python/spider/info/weather/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = tianqi.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = tianqi
diff --git a/python/spider/info/weather/tianqi/__init__.py b/python/spider/info/weather/tianqi/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/spider/info/weather/tianqi/items.py b/python/spider/info/weather/tianqi/items.py
new file mode 100644
index 0000000..d0035d4
--- /dev/null
+++ b/python/spider/info/weather/tianqi/items.py
@@ -0,0 +1,28 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class TianqiItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass
+
+class WeatherItem(scrapy.Item):
+    region = scrapy.Field()
+    provincial = scrapy.Field()
+    data_time = scrapy.Field()
+    city = scrapy.Field()
+
+    weather_1 = scrapy.Field()
+    wind_direction_1 = scrapy.Field()
+    wind_power_1 = scrapy.Field()
+    temperature_1 = scrapy.Field()
+
+    weather_2 = scrapy.Field()
+    wind_direction_2 = scrapy.Field()
+    wind_power_2 = scrapy.Field()
+    temperature_2 = scrapy.Field()
\ No newline at end of file
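A note on the schema: `provincial` holds the province name, `data_time` the forecast date, and the `_1`/`_2` suffixes carry the daytime and nighttime halves of each table row. As a quick sanity check (a standalone sketch, not part of the patch; all field values below are invented), a `scrapy.Item` behaves like a dict and can be wrapped in `ItemAdapter` the way the pipeline later in this patch does:

```python
from itemadapter import ItemAdapter

from tianqi.items import WeatherItem

# Invented sample values, purely to illustrate the field layout.
item = WeatherItem(
    region='华北', provincial='北京', data_time='30日', city='北京',
    weather_1='晴', wind_direction_1='西南风', wind_power_1='3-4级', temperature_1='31',
    weather_2='多云', wind_direction_2='南风', wind_power_2='<3级', temperature_2='18',
)
adapter = ItemAdapter(item)
print(adapter['city'], adapter['temperature_1'], adapter['temperature_2'])
```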
diff --git a/python/spider/info/weather/tianqi/middlewares.py b/python/spider/info/weather/tianqi/middlewares.py
new file mode 100644
index 0000000..504cf5f
--- /dev/null
+++ b/python/spider/info/weather/tianqi/middlewares.py
@@ -0,0 +1,103 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
+
+
+class TianqiSpiderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, or item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request or item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn't have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
+
+
+class TianqiDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either:
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
diff --git a/python/spider/info/weather/tianqi/pipelines.py b/python/spider/info/weather/tianqi/pipelines.py
new file mode 100644
index 0000000..506e9ea
--- /dev/null
+++ b/python/spider/info/weather/tianqi/pipelines.py
@@ -0,0 +1,45 @@
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+# useful for handling different item types with a single interface
+from itemadapter import ItemAdapter
+from openpyxl import Workbook
+
+
+class TianqiPipeline:
+    def process_item(self, item, spider):
+        return item
+
+
+class ExcelPipeline:
+    def __init__(self):
+        self.workbook = Workbook()
+        self.worksheets = {}
+
+    def open_spider(self, spider):
+        self.workbook = Workbook()
+        self.workbook.remove(self.workbook.active)
+
+    def close_spider(self, spider):
+        self.workbook.save('weather.xlsx')
+        self.workbook.close()
+
+    def process_item(self, item, spider):
+        region = item['region']
+        if region not in self.worksheets:
+            ws = self.workbook.create_sheet(title=region)
+            # column headers, in output order
+            ws.append(['省份', '日期时间', '城市', '白天温度', '白天天气', '白天风向', '白天风力',
+                       '夜间温度', '夜间天气', '夜间风向', '夜间风力'])
+            self.worksheets[region] = ws
+        ws = self.worksheets[region]
+        # append the item's values in the same order as the headers
+        ws.append([
+            item['provincial'], item['data_time'], item['city'],
+            item['temperature_1'], item['weather_1'], item['wind_direction_1'], item['wind_power_1'],
+            item['temperature_2'], item['weather_2'], item['wind_direction_2'], item['wind_power_2']
+        ])
+        return item
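Since `ExcelPipeline` only reads item keys and talks to openpyxl, it can be exercised outside a crawl. A minimal sketch (plain dicts stand in for items, which works because the pipeline accesses fields by key, and the `spider` argument is unused by this pipeline; the row values are made up):

```python
from tianqi.pipelines import ExcelPipeline

# One made-up row; the keys match WeatherItem's fields.
row = {
    'region': '华北', 'provincial': '北京', 'data_time': '30日', 'city': '北京',
    'temperature_1': '31', 'weather_1': '晴', 'wind_direction_1': '西南风', 'wind_power_1': '3-4级',
    'temperature_2': '18', 'weather_2': '多云', 'wind_direction_2': '南风', 'wind_power_2': '<3级',
}

pipeline = ExcelPipeline()
pipeline.open_spider(spider=None)
pipeline.process_item(row, spider=None)
pipeline.close_spider(spider=None)  # writes weather.xlsx into the working directory
```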
diff --git a/python/spider/info/weather/tianqi/settings.py b/python/spider/info/weather/tianqi/settings.py
new file mode 100644
index 0000000..15434ce
--- /dev/null
+++ b/python/spider/info/weather/tianqi/settings.py
@@ -0,0 +1,95 @@
+# Scrapy settings for tianqi project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://docs.scrapy.org/en/latest/topics/settings.html
+#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = "tianqi"
+
+SPIDER_MODULES = ["tianqi.spiders"]
+NEWSPIDER_MODULE = "tianqi.spiders"
+
+LOG_LEVEL = "WARNING"
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = False
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+# CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 2
+RANDOMIZE_DOWNLOAD_DELAY = True
+# The download delay setting will honor only one of:
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+# COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+# TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+# DEFAULT_REQUEST_HEADERS = {
+#     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+#     "Accept-Language": "en",
+# }
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+# SPIDER_MIDDLEWARES = {
+#     "tianqi.middlewares.TianqiSpiderMiddleware": 543,
+# }
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+# DOWNLOADER_MIDDLEWARES = {
+#     "tianqi.middlewares.TianqiDownloaderMiddleware": 543,
+# }
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+# EXTENSIONS = {
+#     "scrapy.extensions.telnet.TelnetConsole": None,
+# }
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+    "tianqi.pipelines.TianqiPipeline": 300,
+    "tianqi.pipelines.ExcelPipeline": 301,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+# AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+# AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+# AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+# AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
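`DOWNLOAD_DELAY = 2` combined with `RANDOMIZE_DOWNLOAD_DELAY = True` makes Scrapy wait a uniform random interval between 0.5x and 1.5x the configured delay (here, roughly 1 to 3 seconds) between requests to the same site, avoiding a detectable fixed cadence. The equivalent computation, as a sketch:

```python
import random

DOWNLOAD_DELAY = 2

def randomized_delay(delay: float = DOWNLOAD_DELAY) -> float:
    # Mirrors what the downloader does when RANDOMIZE_DOWNLOAD_DELAY is on.
    return random.uniform(0.5 * delay, 1.5 * delay)

print(randomized_delay())  # e.g. 2.37
```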
+# HTTPCACHE_DIR = "httpcache"
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+
+# Set settings whose default value is deprecated to a future-proof value
+REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
+TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
+FEED_EXPORT_ENCODING = "utf-8"
diff --git a/python/spider/info/weather/tianqi/spiders/__init__.py b/python/spider/info/weather/tianqi/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/python/spider/info/weather/tianqi/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/python/spider/info/weather/tianqi/spiders/weather.py b/python/spider/info/weather/tianqi/spiders/weather.py
new file mode 100644
index 0000000..51076bc
--- /dev/null
+++ b/python/spider/info/weather/tianqi/spiders/weather.py
@@ -0,0 +1,76 @@
+import scrapy
+from urllib.parse import urljoin
+from ..items import WeatherItem
+
+
+class WeatherSpider(scrapy.Spider):
+    name = "weather"
+    allowed_domains = ["www.weather.com.cn"]
+    start_urls = ["http://www.weather.com.cn/textFC/hb.shtml"]
+
+    def parse(self, response):
+        # follow the region tabs (华北, 东北, ...) to each region page
+        areas = response.xpath('/html/body/div[4]/div[2]/div/div/ul[1]/li')
+        for area in areas:
+            area_suffix = area.xpath('.//a/@href').extract_first()
+            area_url = urljoin(response.url, area_suffix)
+            yield scrapy.Request(area_url, callback=self.each_page)
+
+    def each_page(self, response):
+        # one <table> per province; skip the tables hidden for other days
+        all_days_info = response.xpath("//div[@class='hanml']/div[not(@style='display: none;')]//table")
+        region = response.xpath('//title/text()').get().replace('天气预报', '')  # strip the "weather forecast" suffix from the title
+        print(f"Crawling {region}")
+        for days_info in all_days_info:
+            info_trs = days_info.css('tr')
+
+            data_time = info_trs[0].xpath(".//td[3]/text()").extract_first().replace('白天', '').replace('夜间', '')
+            provincial = info_trs[2].xpath(".//td[1]/a/text()").extract_first()
+
+            loop_count = 1
+            for info_tr in info_trs:
+                loop_count += 1
+                # the first two rows are table headers
+                if loop_count < 4:
+                    continue
+                if loop_count == 4:
+                    # the first data row carries an extra province cell,
+                    # so every column is shifted right by one
+                    city = info_tr.xpath(".//td[2]/a/text()").extract_first()
+
+                    weather_1 = info_tr.xpath(".//td[3]/text()").extract_first()
+                    wind_direction_1 = info_tr.xpath(".//td[4]/span[1]/text()").extract_first()
+                    wind_power_1 = info_tr.xpath(".//td[4]/span[2]/text()").extract_first()
+                    temperature_1 = info_tr.xpath(".//td[5]/text()").extract_first()
+
+                    weather_2 = info_tr.xpath(".//td[6]/text()").extract_first()
+                    wind_direction_2 = info_tr.xpath(".//td[7]/span[1]/text()").extract_first()
+                    wind_power_2 = info_tr.xpath(".//td[7]/span[2]/text()").extract_first()
+                    temperature_2 = info_tr.xpath(".//td[8]/text()").extract_first()
+                else:
+                    city = info_tr.xpath(".//td[1]/a/text()").extract_first()
+
+                    weather_1 = info_tr.xpath(".//td[2]/text()").extract_first()
+                    wind_direction_1 = info_tr.xpath(".//td[3]/span[1]/text()").extract_first()
+                    wind_power_1 = info_tr.xpath(".//td[3]/span[2]/text()").extract_first()
+                    temperature_1 = info_tr.xpath(".//td[4]/text()").extract_first()
+
+                    weather_2 = info_tr.xpath(".//td[5]/text()").extract_first()
+                    wind_direction_2 = info_tr.xpath(".//td[6]/span[1]/text()").extract_first()
+                    wind_power_2 = info_tr.xpath(".//td[6]/span[2]/text()").extract_first()
+                    temperature_2 = info_tr.xpath(".//td[7]/text()").extract_first()
+
+                weather_item = WeatherItem()
+                weather_item['region'] = region
+                weather_item['provincial'] = provincial
+                weather_item['data_time'] = data_time
+                weather_item['city'] = city
+                weather_item['weather_1'] = weather_1
+                weather_item['wind_direction_1'] = wind_direction_1
+                weather_item['wind_power_1'] = wind_power_1
+                weather_item['temperature_1'] = temperature_1
+                weather_item['weather_2'] = weather_2
+                weather_item['wind_direction_2'] = wind_direction_2
+                weather_item['wind_power_2'] = wind_power_2
+                weather_item['temperature_2'] = temperature_2
+                yield weather_item
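The `loop_count` bookkeeping exists because the first data row of each province table carries an extra rowspan province cell, shifting every subsequent `td` index right by one, while the first two rows are headers. An equivalent formulation of the loop body with an explicit column offset (a sketch only, not part of the patch; it drops into `each_page` where `info_trs` is already defined):

```python
# Sketch: the same traversal expressed with enumerate and a column offset.
for row_index, info_tr in enumerate(info_trs):
    if row_index < 2:
        continue  # rows 0-1 are header rows
    # The first data row (index 2) starts with a province cell, so all
    # remaining columns are shifted right by one.
    offset = 1 if row_index == 2 else 0
    city = info_tr.xpath(f".//td[{1 + offset}]/a/text()").get()
    weather_1 = info_tr.xpath(f".//td[{2 + offset}]/text()").get()
    # ...the remaining fields follow the same pattern with `offset` added.
```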
project +# +# For simplicity, this file contains only settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = "tianqi" + +SPIDER_MODULES = ["tianqi.spiders"] +NEWSPIDER_MODULE = "tianqi.spiders" + +LOG_LEVEL = "WARNING" +# Crawl responsibly by identifying yourself (and your website) on the user-agent +USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36" + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +# CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +DOWNLOAD_DELAY = 2 +RANDOMIZE_DOWNLOAD_DELAY = True +# The download delay setting will honor only one of: +# CONCURRENT_REQUESTS_PER_DOMAIN = 16 +# CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +# COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +# TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +# DEFAULT_REQUEST_HEADERS = { +# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", +# "Accept-Language": "en", +# } + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +# SPIDER_MIDDLEWARES = { +# "tianqi.middlewares.TianqiSpiderMiddleware": 543, +# } + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# DOWNLOADER_MIDDLEWARES = { +# "tianqi.middlewares.TianqiDownloaderMiddleware": 543, +# } + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +# EXTENSIONS = { +# "scrapy.extensions.telnet.TelnetConsole": None, +# } + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + "tianqi.pipelines.TianqiPipeline": 300, + "tianqi.pipelines.ExcelPipeline": 301, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +# AUTOTHROTTLE_ENABLED = True +# The initial download delay +# AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +# AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +# AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +# HTTPCACHE_ENABLED = True +# HTTPCACHE_EXPIRATION_SECS = 0 +# HTTPCACHE_DIR = "httpcache" +# HTTPCACHE_IGNORE_HTTP_CODES = [] +# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" + +# Set settings whose default value is deprecated to a future-proof value +REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" +TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" +FEED_EXPORT_ENCODING = "utf-8" diff --git 
diff --git a/python/spider/info/weather/weather.xlsx b/python/spider/info/weather/weather.xlsx
new file mode 100644
index 0000000..abda57a
Binary files /dev/null and b/python/spider/info/weather/weather.xlsx differ
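To regenerate the committed workbook, run `scrapy crawl weather` from the directory containing scrapy.cfg; `ExcelPipeline` then writes weather.xlsx with one sheet per region. The crawl can also be driven programmatically, as in this sketch (it assumes the script is launched from the project root so `get_project_settings()` can resolve the tianqi settings module):

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from tianqi.spiders.weather import WeatherSpider

process = CrawlerProcess(get_project_settings())
process.crawl(WeatherSpider)
process.start()  # blocks until the crawl finishes; pipelines then write weather.xlsx
```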