In this Notebook we will analyse some of the tables of data at https://en.wikipedia.org/wiki/Demographics_of_India relating to the demographics of India.
First, we will obtain a local copy of the HTML file so we don't have to keep fetching it from the internet. We will save it in a subdirectory specified by the variable HTML_DIR
.
import os
import urllib
# The directory we're going to save local copies of the HTML files into.
HTML_DIR = 'html'
if not os.path.exists(HTML_DIR):
os.mkdir(HTML_DIR)
def get_htmlpath(filename):
"""Get qualified path to local HTML file filename."""
return os.path.join(HTML_DIR, filename)
def fetch_html(url, filename):
"""Fetch HTML file for continent from internet and save as filename."""
print('Fetching HTML file from', url, '...')
req = urllib.request.urlopen(url)
html = req.read().decode()
filepath = get_htmlpath(filename)
print('Saving as', filepath, '...')
with open(filepath, 'w') as fo:
fo.write(html)
url = 'https://en.wikipedia.org/wiki/Demographics_of_India'
filename = 'india-demographics.html'
fetch_html(url, filename)
# Check the file is there:
!ls -l $HTML_DIR
We're in business. First some imports and configuration for our Jupyter session:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# I want to always display warnings
import warnings
np.seterr(all='warn')
warnings.simplefilter("always")
Now read the data tables into a list of pandas DataFrames
:
data = pd.read_html(os.path.join(HTML_DIR, filename))
print(f'Read in {len(data)} HTML tables.')
The first one I'm interested in lists the some population distribution by states and union territories and turns out to be at index 8. We'll make a copy into the object df
so that we can always check it against the original if we need to. We'll also do some tidying in the following lines:
# Population distribution by states / union territories (2011)
df = data[8].copy()
df.set_index('State/UT', inplace=True)
# Match index labels against strings that end in " (UT)" and strip this part:
df.rename({e: re.sub(r'(.+)\s\(UT\)$',r'\1', e) for e in df.index}, inplace=True)
df.drop(['Rank', 'Percent (%)', 'Population[52]', 'Difference between male and female',
'Sex Ratio', 'Density (per km2)'], axis='columns', inplace=True)
df.rename({'Rural[53]': 'Rural', 'Urban[53]': 'Urban', 'Area[54] (km2)': 'Area (km2)',
'Male': 'Male Population', 'Female': 'Female Population'},
axis='columns', inplace=True)
df.drop('Total (India)', inplace=True)
df
Now let's recreate columns for each state for the total population, sex ratio (number of women per 1000 men) and population density and produce some summary statistics:
df['Population'] = df['Male Population'] + df['Female Population']
df['Sex Ratio'] = df['Female Population'] / df['Male Population'] * 1000
df['Population density (km-2)'] = df['Population'] / df['Area (km2)']
total_pop = df['Population'].sum()
print('Total population of India (2011): {:,}'.format(total_pop))
state_sr_min = df['Sex Ratio'].idxmin()
print('Lowest sex ratio: F:M * 1000 = {:.1f} ({})'.format(df.loc[state_sr_min]['Sex Ratio'], state_sr_min))
state_sr_max = df['Sex Ratio'].idxmax()
print('Highest sex ratio: F:M * 1000 = {:.1f} ({})'.format(df.loc[state_sr_max]['Sex Ratio'], state_sr_max))
state_least_dense = df['Population density (km-2)'].idxmin()
print('Lowest population density: {:.1f} km-2 ({})'.format(
df.loc[state_least_dense]['Population density (km-2)'], state_least_dense))
mean_pop_density = total_pop / df['Area (km2)'].sum()
print('Aveage population density of India: {:.1f} km-2'.format(mean_pop_density))
Next, let's look at the table on literacy rates, which turns out to be the one indexed at 16. We'll do some more cleaning after obtaining a copy of the DataFrame
first:
df_lit = data[16][:-1].copy()
df_lit.set_index('State or UT', inplace=True)
df_lit.rename({e: re.sub(r'(.+)\s\(UT\)',r'\1', e) for e in df.index}, inplace=True)
df_lit.rename(index={'Andhra Pradesh[74]': 'Andhra Pradesh'}, inplace=True)
df_lit.drop('State or UT code', axis='columns', inplace=True)
df_lit
We'd like to combine this table with the existing one, df
, which is also indexed by state / UT name:
df[['Male Literacy (%)', 'Female Literacy (%)']] = df_lit[['Male (%)', 'Female (%)']]
df
We can see we have a problem: there's no literacy data in our table for the state of Telangana. If we want the total literacy rate for the whole of India, we cannot therefore simply take a population-weighted sum of the literacy columns:
weights = df['Male Population'] / df['Male Population'].sum()
(df['Male Literacy (%)'] * weights).sum() # wrong
Instead, we need to form the weighted sum over only the population rows which are not null in the literacy column. This gives:
male_weights = df['Male Population'][df['Male Literacy (%)'].notnull()].sum()
total_male_literacy = (df['Male Literacy (%)'] * df['Male Population'] / male_weights).sum()
female_weights = df['Female Population'][df['Female Literacy (%)'].notnull()].sum()
total_female_literacy = (df['Female Literacy (%)'] * df['Female Population'] / female_weights).sum()
print('Total male literacy rate: {:.1f} %'.format(total_male_literacy))
print('Total female literacy rate: {:.1f} %'.format(total_female_literacy))
The Wikipedia page at https://en.wikipedia.org/wiki/List_of_Indian_states_and_union_territories_by_GDP provides a table of nominal GSDP (gross state domestic product) for the years since 2011.
Again, we'll start by taking a local copy:
url = 'https://en.wikipedia.org/wiki/List_of_Indian_states_and_union_territories_by_GDP'
gdsp_filename = 'india-gdsp.html'
fetch_html(url, gdsp_filename)
!ls -l $HTML_DIR
# NB missing values appear in the tables as '–'
gdsp_data = pd.read_html(os.path.join(HTML_DIR, gdsp_filename), na_values='–')
print(f'Read in {len(gdsp_data)} HTML tables.')
More tidying:
gdf = gdsp_data[3]
gdf.set_index('State/Union territory', inplace=True)
gdf.index.name = 'State/UT'
gdf.drop('India', inplace=True)
gdf.dropna(axis='columns', inplace=True)
df['GSDP (₹ crore)'] = gdf.iloc[:,0]
A more useful measure would be the per capita GSDP:
df['GSDP per head (₹)'] = df['GSDP (₹ crore)'] / df['Population'] * 1.e7
Let's plot some statistics:
df.plot.scatter('Female Literacy (%)', 'GSDP per head (₹)')
df.plot.scatter('Sex Ratio', 'GSDP per head (₹)')
df.plot.scatter('Sex Ratio', 'Female Literacy (%)')
url = 'https://en.wikipedia.org/wiki/List_of_states_and_union_territories_of_India_by_crime_rate'
crime_filename = 'india-crime.html'
fetch_html(url, crime_filename)
crime_data = pd.read_html(os.path.join(HTML_DIR, crime_filename), na_values='–')
print(f'Read in {len(crime_data)} HTML tables.')
cdf = crime_data[1]
cdf.set_index('State/UT', inplace=True)
df['Crime Rate'] = cdf['Rate of Cognizable Crimes (2016)']
df.plot.scatter('GSDP per head (₹)', 'Crime Rate')
url = 'https://en.wikipedia.org/wiki/List_of_states_and_union_territories_of_India_by_fertility_rate'
fertility_filename = 'india-fertility.html'
fetch_html(url, fertility_filename)
fertility_data = pd.read_html(os.path.join(HTML_DIR, fertility_filename), na_values='-')
print(f'Read in {len(fertility_data)} HTML tables.')
This one requires quite a bit of cleaning and sorting.
fdf = fertility_data[0].copy()
fdf.set_index('State/UT', inplace=True)
fdf.drop('India', inplace=True)
fdf.rename({'Andaman & Nicobar': 'Andaman and Nicobar Islands', 'Daman & Diu': 'Daman and Diu',
'Puduchery': 'Puducherry', 'Dadra Nagar Haveli': 'Dadra and Nagar Haveli'}, inplace=True)
df['Fertility Rate'] = pd.to_numeric(fdf['Fertility rate 2009 [2]'].replace(r'\*', '', regex=True))
df
Is there an (inverse) correlation between female literacy and ferility?
df['Female Literacy (%)'].corr(df['Fertility Rate'])
df.plot.scatter('Female Literacy (%)', 'Fertility Rate')
As expected, there is a strong anti-correlation of fertility rate with female literacy. Note: there is no reason to expect the relationship to be linear, however.