天气数据分析

This commit is contained in:
lsy2246 2024-06-03 17:21:05 +08:00
parent 83bceae10a
commit 14a52704a6
5 changed files with 60 additions and 80 deletions

View File

@ -1,4 +0,0 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View File

@ -1,76 +0,0 @@
import scrapy
from urllib.parse import urljoin
from ..items import WeatherItem
class WeatherSpider(scrapy.Spider):
    """Scrape the text forecast tables published on www.weather.com.cn.

    ``parse`` reads the area links from the start page and schedules one
    request per area; ``each_page`` turns every table row of an area page
    into a ``WeatherItem``.
    """

    name = "weather"
    allowed_domains = ["www.weather.com.cn"]
    start_urls = ["http://www.weather.com.cn/textFC/hb.shtml"]

    def parse(self, response):
        """Yield one request per area link found on the start page."""
        areas = response.xpath('/html/body/div[4]/div[2]/div/div/ul[1]/li')
        for area in areas:
            area_suffix = area.xpath('.//a/@href').get()
            if area_suffix is None:
                # urljoin(base, None) returns the base URL unchanged, which
                # would only re-request the page we are already parsing.
                continue
            area_url = urljoin(response.url, area_suffix)
            yield scrapy.Request(area_url, callback=self.each_page)

    def each_page(self, response):
        """Yield a ``WeatherItem`` for every data row of every visible table.

        The first two rows of each table are skipped (the date is read from
        the first one). The third row is read with every column shifted right
        by one, because its first cell holds the province link.
        """
        all_days_info = response.xpath(
            "//div[@class='hanml']/div[not(@style='display: none;')]//table"
        )
        # default='' keeps a page without <title> from raising AttributeError.
        region = response.xpath('//title/text()').get(default='').replace('天气预报', '')
        self.logger.info("正在爬取%s", region)

        for days_info in all_days_info:
            info_trs = days_info.css('tr')
            if len(info_trs) < 3:
                # Two skipped rows plus at least one data row are required.
                continue

            header_text = info_trs[0].xpath(".//td[3]/text()").get(default='')
            data_time = header_text.replace('白天', '').replace('夜间', '')
            provincial = info_trs[2].xpath(".//td[1]/a/text()").get()

            for row_number, info_tr in enumerate(info_trs[2:]):
                # Only the first data row carries the extra leading cell
                # (presumably a row-spanning province cell - verify on site).
                offset = 1 if row_number == 0 else 0

                weather_item = WeatherItem()
                weather_item['region'] = region
                weather_item['provincial'] = provincial
                weather_item['data_time'] = data_time
                weather_item['city'] = info_tr.xpath(
                    f".//td[{1 + offset}]/a/text()").get()
                weather_item['weather_1'] = info_tr.xpath(
                    f".//td[{2 + offset}]/text()").get()
                weather_item['wind_direction_1'] = info_tr.xpath(
                    f".//td[{3 + offset}]/span[1]/text()").get()
                weather_item['wind_power_1'] = info_tr.xpath(
                    f".//td[{3 + offset}]/span[2]/text()").get()
                weather_item['temperature_1'] = info_tr.xpath(
                    f".//td[{4 + offset}]/text()").get()
                weather_item['weather_2'] = info_tr.xpath(
                    f".//td[{5 + offset}]/text()").get()
                weather_item['wind_direction_2'] = info_tr.xpath(
                    f".//td[{6 + offset}]/span[1]/text()").get()
                weather_item['wind_power_2'] = info_tr.xpath(
                    f".//td[{6 + offset}]/span[2]/text()").get()
                weather_item['temperature_2'] = info_tr.xpath(
                    f".//td[{7 + offset}]/text()").get()
                yield weather_item

View File

@ -0,0 +1,60 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Weather analysis script: for every sheet of ``weather.xlsx`` compute the
# mean day and night temperature per date, draw one subplot per sheet and
# save the figure as ``weather_matplotlib.svg``.

# _data[sheet][date] -> {'白天温度': [...], '夜间温度': [...]} (raw readings)
_data = {}
file_path = 'weather.xlsx'

# Open the workbook once and close it deterministically; every sheet is read
# through the same handle instead of re-opening the file by path.
with pd.ExcelFile(file_path, engine='openpyxl') as _xls:
    for _area in _xls.sheet_names:
        _area_data = _data.setdefault(_area, {})
        _xls_content = pd.read_excel(_xls, sheet_name=_area)
        for _index, _single in _xls_content.iterrows():
            _readings = _area_data.setdefault(
                _single['日期时间'], {'白天温度': [], '夜间温度': []}
            )
            _readings['白天温度'].append(_single['白天温度'])
            _readings['夜间温度'].append(_single['夜间温度'])

# last_dick[sheet] -> parallel lists of dates and mean day/night temperatures,
# in the order the dates first appear in the sheet.
last_dick = {}
for _area, _date_time in _data.items():
    last_dick[_area] = {
        "时间": list(_date_time),
        "白天": [float(np.mean(_v['白天温度'])) for _v in _date_time.values()],
        "夜间": [float(np.mean(_v['夜间温度'])) for _v in _date_time.values()],
    }

# Select a CJK-capable font before any text artist is created.
plt.rcParams['font.sans-serif'] = ['SimHei']

# squeeze=False always returns a 2-D array of axes, so a workbook with a
# single sheet needs no special case.
_fig, _axes = plt.subplots(
    nrows=len(last_dick), ncols=1, figsize=(10, 30), squeeze=False
)

for _ax, (_area, _data_info) in zip(_axes[:, 0], last_dick.items()):
    _data_s = _data_info['时间']
    _day_s = _data_info['白天']
    _night_s = _data_info['夜间']

    _ax.set_title(_area)
    if not _data_s:
        # An empty sheet has nothing to plot; indexing _data_s[0] and taking
        # np.max of an empty list would both raise.
        continue

    _ax.plot(_data_s, _day_s, label='白天', marker='o')
    _ax.plot(_data_s, _night_s, label='夜间', marker='o')

    # Label each point: day values above the marker, night values below it.
    for _series, _y_shift in ((_day_s, 10), (_night_s, -15)):
        for _x, _y in zip(_data_s, _series):
            _ax.annotate(
                f'{_y:.1f}°', (_x, _y), xytext=(5, _y_shift),
                textcoords='offset points', ha='center',
            )

    _ax.set_yticks(range(0, 45, 5))
    _ax.set_yticklabels([f"{i}°" for i in range(0, 45, 5)])
    _ax.grid(alpha=0.5, linestyle='--')
    _ax.legend()
    _ax.set_xlim([_data_s[0], _data_s[-1]])

    # Summary labels sit just above the top edge of the axes, left to right.
    _x_left = _ax.get_xlim()[0]
    _y_top = _ax.get_ylim()[1] + 0.2
    _ax.text(_x_left, _y_top, f"最高温度{np.max(_day_s):.1f}°")
    _ax.text(_x_left + 0.8, _y_top, f"最低温度{np.min(_night_s):.1f}°")
    # NOTE(review): labelled "average temperature" but only the night series
    # is averaged - confirm whether day values should be included.
    _ax.text(_x_left + 1.6, _y_top, f"平均温度{np.mean(_night_s):.1f}°")

_fig.savefig('weather_matplotlib.svg', bbox_inches='tight', transparent=True)
plt.show()