77 lines
3.7 KiB
Python
77 lines
3.7 KiB
Python
import scrapy
|
|
from urllib.parse import urljoin
|
|
from ..items import WeatherItem
|
|
|
|
class WeatherSpider(scrapy.Spider):
    """Spider that collects per-city forecasts from www.weather.com.cn.

    ``parse`` reads the area links on the start page and schedules one
    request per area; ``each_page`` turns every city row of every visible
    forecast table on an area page into a ``WeatherItem``.
    """

    name = "weather"
    allowed_domains = ["www.weather.com.cn"]
    start_urls = ["http://www.weather.com.cn/textFC/hb.shtml"]

    # The first two rows of a forecast table are not city rows (row 0 holds
    # the date label); city rows start at the third row.
    _LEADING_ROWS = 2

    def parse(self, response):
        """Yield one request per area link found on the start page.

        Args:
            response: The downloaded start page.

        Yields:
            scrapy.Request: A request for each area page, handled by
            ``each_page``.
        """
        areas = response.xpath('/html/body/div[4]/div[2]/div/div/ul[1]/li')
        for area in areas:
            area_suffix = area.xpath('.//a/@href').extract_first()
            if not area_suffix:
                # urljoin() returns the base URL unchanged for an empty or
                # missing href, which would re-request the listing page.
                continue
            area_url = urljoin(response.url, area_suffix)
            yield scrapy.Request(area_url, callback=self.each_page)

    def each_page(self, response):
        """Yield a ``WeatherItem`` for every city row on an area page.

        Tables with fewer than three rows are skipped instead of aborting
        the whole page. A missing title or date cell yields ``None`` for
        the corresponding field rather than raising.

        Args:
            response: The downloaded area page.

        Yields:
            WeatherItem: One item per city row, carrying the page region,
            the table's province and date label, and the two forecast
            column groups (``*_1`` and ``*_2``; presumably daytime and
            night, judging by the stripped header suffixes).
        """
        all_days_info = response.xpath(
            "//div[@class='hanml']/div[not(@style='display: none;')]//table"
        )

        title = response.xpath('//title/text()').get()
        region = title.replace('天气预报', '') if title is not None else None
        self.logger.info("正在爬取%s", region)

        for days_info in all_days_info:
            info_trs = days_info.css('tr')
            if len(info_trs) <= self._LEADING_ROWS:
                # No city rows (and no row to read the province from).
                continue

            data_time = info_trs[0].xpath(".//td[3]/text()").extract_first()
            if data_time is not None:
                data_time = data_time.replace('白天', '').replace('夜间', '')

            # The province name is read from the first cell of the first
            # city row.
            provincial = info_trs[self._LEADING_ROWS].xpath(
                ".//td[1]/a/text()"
            ).extract_first()

            for position, info_tr in enumerate(info_trs[self._LEADING_ROWS:]):
                # Only the first city row carries the extra leading province
                # cell, so its data columns are shifted right by one.
                offset = 1 if position == 0 else 0

                weather_item = WeatherItem()
                weather_item['region'] = region
                weather_item['provincial'] = provincial
                weather_item['data_time'] = data_time
                for key, value in self._row_fields(info_tr, offset).items():
                    weather_item[key] = value
                yield weather_item

    @staticmethod
    def _row_fields(info_tr, offset):
        """Extract the per-city fields from one table row.

        Args:
            info_tr: Selector for the ``<tr>`` element.
            offset: Number of extra leading cells before the city cell.

        Returns:
            dict: Field name to extracted text (``None`` where the cell is
            absent), in the order the fields are stored on the item.
        """

        def cell(column, inner="text()"):
            return info_tr.xpath(
                f".//td[{column + offset}]/{inner}"
            ).extract_first()

        return {
            'city': cell(1, "a/text()"),
            'weather_1': cell(2),
            'weather_2': cell(5),
            'wind_direction_1': cell(3, "span[1]/text()"),
            'wind_power_1': cell(3, "span[2]/text()"),
            'temperature_1': cell(4),
            'wind_direction_2': cell(6, "span[1]/text()"),
            'wind_power_2': cell(6, "span[2]/text()"),
            'temperature_2': cell(7),
        }
|
|
|
|
|
|
|
|
|
|
|
|
|