# Filtering the bad data points out of this data set is not easy, and more
# could be done here (particularly for the rainfall measurements).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Read in the data and assign column names. header=None makes pandas treat
# the first line of the file as data, so the names are supplied here.
df = pd.read_csv(
    'weather-raw.csv',
    header=None,
    names=[
        'Timestamp', 'Temperature', 'Humidity', 'Dew Point', 'Pressure',
        'Wind Speed', 'Wind Bearing', 'Sunshine', 'Rainfall',
        'Max Wind Speed',
    ],
)

# Rescale the values according to the documentation. Each key is the divisor
# applied to the column (or list of columns) it maps to.
# Temperature -> deg C, Dew Point -> deg C, Wind Speed -> knots,
# Sunshine -> hours, Rainfall -> mm
scalings = {
    10: ['Temperature', 'Dew Point', 'Wind Speed', 'Max Wind Speed'],
    100: 'Sunshine',
    1000: 'Rainfall',
}
for fac, cols in scalings.items():
    # Plain column selection works for both a single name and a list of
    # names, so no special-casing of the two value types is needed.
    df[cols] = df[cols] / fac

# Turn the index into a DatetimeIndex so that date-string slicing and
# resample() can be used below. The 'Timestamp' column itself is kept.
df.index = pd.to_datetime(df['Timestamp'])
# -- (a) --
# Wind bearing is the cardinal direction the wind is blowing *to*.
# mode() returns a Series (more than one entry if several bearings tie for
# most frequent); the first entry is the one reported.
wind_bearing_modes = df['Wind Bearing'].mode()
modal_wind_bearing = wind_bearing_modes.iloc[0]
print('Most common wind bearing: {} deg'.format(modal_wind_bearing))
# Output recorded for this data set:
#   Most common wind bearing: 180.0 deg
# (ie from the north).
# -- (b) --
# There is some obviously bad data indicating implausibly high wind speeds
# over a cluster of times in 2015: filter this out by replacing every reading
# above 100 with NaN (idxmax() skips NaN values by default).
df.loc[df['Max Wind Speed'] > 100, 'Max Wind Speed'] = np.nan
idx = df['Max Wind Speed'].idxmax()
# Select row and column in a single .loc call instead of chaining
# df.loc[idx][...], which first materialises the whole row.
wind_max = df.loc[idx, 'Max Wind Speed']
print('Max wind speed: {} knots at {}'.format(wind_max, idx))
# Output recorded for this data set:
#   Max wind speed: 96.0 knots at 2016-01-08 16:00:00
# (well, maybe).
# -- (c) --
# Total the sunshine hours for each calendar month. A MonthEnd offset object
# is passed rather than the 'M' string alias: the alias is deprecated in
# recent pandas releases (renamed 'ME'), whereas the offset object gives the
# same month-end bins on old and new versions alike.
sunshine_monthly = df['Sunshine'].resample(pd.offsets.MonthEnd()).sum()
# Restrict to the June totals, then locate the largest of them.
june_sunshine = sunshine_monthly[sunshine_monthly.index.month == 6]
idx = june_sunshine.idxmax()
print('Greatest number of sunshine hours in a June: {:.1f} in {}'
      .format(june_sunshine[idx], idx.year))
# Output recorded for this data set:
#   Greatest number of sunshine hours in a June: 236.8 in 2006
# -- (d) --
# Sum the rainfall over each calendar day, then pick out the wettest day.
rain_daily = df['Rainfall'].resample('D').sum()
idx = rain_daily.idxmax()
rain_max = rain_daily.loc[idx]
print('Highest rainfall: {:.2f} mm on {}'.format(rain_max, idx))
# Output recorded for this data set (value shown to 2 d.p. by the format):
#   Highest rainfall: 171.23 mm on 2011-02-26 00:00:00
# (Not likely): TODO – cross-reference rainfall data with Met Office
# recordings at:
# https://www.metoffice.gov.uk/pub/data/weather/uk/climate/stationdata/cambridgedata.txt
# -- (e) --
# There are different ways we could filter out the bad temperature data.

# Method 1: exclude data points nstd standard deviations away from the mean.
# Only readings below the mean are screened, because the minimum is wanted.
nstd = 3
raw_temperature = df['Temperature']
thresh = raw_temperature.mean() - nstd * raw_temperature.std()
# mask() returns a new Series with NaN wherever the condition holds, so df
# itself is not modified by this method.
tser = raw_temperature.mask(raw_temperature < thresh)
idx = tser.idxmin()
print('Lowest temperature (method 1): {} deg C at {}'.format(tser[idx], idx))
# Output recorded for this data set:
#   Lowest temperature (method 1): -8.8 deg C at 2010-12-20 01:00:00

# Method 2: exclude data points from an identified date range.
# Unlike method 1, this overwrites the readings in df itself.
bad_period = slice('2015-10-15', '2016-01-15')
df.loc[bad_period, 'Temperature'] = np.nan
idx = df['Temperature'].idxmin()
lowest_temperature = df.loc[idx, 'Temperature']
print('Lowest temperature (method 2): {} deg C at {}'
      .format(lowest_temperature, idx))
# Output recorded for this data set:
#   Lowest temperature (method 2): -13.8 deg C at 2012-02-11 07:00:00