%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Show which style sheets the installed Matplotlib provides.
print(plt.style.available)
sns.set_style("whitegrid")
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*";
# newer releases no longer ship the old names. Use whichever spelling this
# installation offers instead of failing on a missing style. If neither is
# present, the seaborn call above has already applied a whitegrid theme.
for _style_name in ('seaborn-whitegrid', 'seaborn-v0_8-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
# https://www.theguardian.com/world/datablog/2010/jul/26/wikileaks-afghanistan-ied-attacks
# or https://pan.baidu.com/s/1jKjBXOy
# numpy provides the NaN marker used below for values that are not 200x dates.
import numpy as np

df = pd.read_excel('../data/wikileaks/ExplodedIED.xls')
df.head()
len(df)

# Collapse every date to the first day of its month: keep the first eight
# characters of the stringified value and append "01".
# NOTE(review): assumes DateOccurred renders as "YYYY-MM-..." - confirm.
df['time'] = [str(i)[:8] + '01' for i in df.DateOccurred]
# Anything that does not contain "200" is treated as missing.
df['time'] = [i if '200' in i else np.nan for i in df.time]
df['time'] = pd.to_datetime(df.time, format='%Y-%m-%d')

# Same filter for the year; missing entries are stored as NaN.
df['year'] = [str(i)[:4] for i in df.DateOccurred]
df['year'] = [int(i) if '200' in i else np.nan for i in df.year]

# Keep only rows that have both coordinates.
df = df.dropna(subset=['Latitude', 'Longitude'])
len(df)
df.columns
df.iloc[0]
# --- Pie 1: incidents per region, slices ordered by region label -----------
region_freq = df.groupby('Region').size()
# One radial offset per slice; the tuple length must match the slice count.
explode = (0.2, 0.1, 0.2, 0.1, 0.2, 0.3)
plt.figure(figsize=(8, 8))
plt.pie(
    region_freq,
    labels=region_freq.index,
    explode=explode,
    autopct='%0.1f%%',
    pctdistance=0.2,
    shadow=True,
)
plt.show()

df.Region.unique()
df.Region.value_counts()

# --- Pie 2: incidents per category ------------------------------------------
plt.figure(figsize=(8, 8))
explode = (0.2, 0.3)
category_counts = df.Category.value_counts()
category_counts.plot.pie(
    explode=explode,
    autopct='%0.1f%%',
    pctdistance=0.2,
    shadow=True,
)
plt.show()

# --- Pie 3: incidents per region, slices ordered by frequency ---------------
plt.figure(figsize=(8, 8))
explode = (0.1, 0.1, 0.2, 0.1, 0.2, 0.3)
region_counts = df.Region.value_counts()
region_counts.plot.pie(
    explode=explode,
    autopct='%0.1f%%',
    pctdistance=.5,
    shadow=True,
)
plt.show()
df.time.value_counts().index[20]

# Count incidents per month once and put the months in chronological order,
# so the line is drawn left to right in time rather than by frequency.
monthly_counts = df.time.value_counts().sort_index()
monthly_counts.plot(kind='line', figsize=(15, 5))

# Month with the most incidents and its count.
xmax = monthly_counts.idxmax()
ymax = monthly_counts.max()
plt.vlines(x=xmax, ymin=0, ymax=ymax, color='r')
# The data only holds 200x dates; the Afghan presidential election took place
# on 20 August 2009.
plt.annotate('The presidential election \n 2009 Aug 20',
             xytext=(pd.Timestamp('2007-06-01 00:00:00'), ymax),
             xy=(xmax, ymax),
             arrowprops=dict(facecolor='green', shrink=0.05),
             fontsize=20)
plt.show()
import folium #, jinja2, vincent
from folium import plugins
from IPython.display import IFrame
from IPython.core.display import HTML
# Report which folium installation is in use.
print(folium.__file__)
print(folium.__version__)

# (latitude, longitude) pair for every remaining incident.
nodes = list(zip(df.Latitude.values, df.Longitude.values))

# Base map centred on the mean incident position.
map_centre = [df.Latitude.mean(), df.Longitude.mean()]
map_osm = folium.Map(
    location=map_centre,
    tiles='cartodbpositron',
    zoom_start=5.4,
)

# Overlay the heat layer, then zoom to whatever the map now contains.
plugins.HeatMap(nodes).add_to(map_osm)
map_osm.fit_bounds(map_osm.get_bounds())
map_osm
def year_map(year):
    """Return a folium map with one small red circle per incident in ``year``.

    Incidents are read from the module-level ``df``; rows are selected by
    comparing ``df.year`` with ``year``. The map is centred on the mean
    position of all incidents and then fitted to the markers it contains.
    """
    map_centre = [df.Latitude.mean(), df.Longitude.mean()]
    map_osm = folium.Map(
        location=map_centre,
        tiles='cartodbpositron',
        zoom_start=5.4,
    )

    # Coordinates of the incidents recorded for the requested year.
    selected = df.loc[df.year == year, ['Latitude', 'Longitude']]
    for lat, lon in zip(selected.Latitude.values, selected.Longitude.values):
        marker = folium.CircleMarker(
            location=[lat, lon],
            radius=1,
            color='red',
            fill_color='red',
            weight=.5,
        )
        marker.add_to(map_osm)

    # Zoom so that every marker is visible.
    map_osm.fit_bounds(map_osm.get_bounds())
    return map_osm
# Write one HTML map per year. Rows whose date could not be parsed carry a
# NaN year; skip those, because NaN matches no rows and would only produce an
# empty map with undefined bounds saved under a "..._nan.html" name.
for year in df.year.dropna().unique():
    map_osm = year_map(year)
    map_osm.save('../vis/Wikileaks_Afghanistan_Year_' + str(year) + '.html')

# Keep two years around for inline display.
map2004 = year_map(2004)
map2005 = year_map(2005)
map2004
map2005
import geopandas as gpd
# Read the district boundaries using gpd.read_file()
country = gpd.read_file('../data/afghanistan_district398.shp')
# Inspect the coordinate reference system stored with the layer. (The previous
# argument-less to_crs() call raises, and `places` does not exist at module
# level, so neither line could run.)
country.crs
country.plot(figsize=(15, 15), color='grey')
plt.show()
def plot_points_on_shapefile(year, ax):
    """Draw the incidents of one year on top of the district boundaries.

    Incident coordinates come from the module-level ``df``; the district
    shapefile is loaded from disk on every call.

    Parameters
    ----------
    year
        Value compared with ``df.year`` to select the incidents to draw.
    ax
        Matplotlib axes that receives the boundaries, the points and the
        title. All drawing goes to this axes, whether or not it is the
        current one.
    """
    from shapely.geometry import Point
    import geopandas as gpd

    # https://esoc.princeton.edu/files/administrative-boundaries-398-districts
    country = gpd.read_file('../data/afghanistan_district398.shp')

    # Copy the selection so the column changes below never write into a
    # slice of df.
    wanted = ['TrackingNumber', 'Latitude', 'Longitude', 'year']
    places = df.loc[df.year == year, wanted].copy()

    # Build the geometry column from the coordinates.
    # Longitude is east-west (i.e. X) and latitude is north-south (i.e. Y).
    places["geometry"] = [
        Point(lon, lat)
        for lon, lat in zip(places["Longitude"], places["Latitude"])
    ]
    places = places.drop(columns=["Latitude", "Longitude", "year"])

    # Declare both layers as WGS84 without transforming any coordinates.
    # set_crs(..., allow_override=True) replaces assigning a {"init": ...}
    # dict to .crs, which is deprecated.
    # NOTE(review): this relabels the shapefile as WGS84 - confirm that its
    # coordinates really are longitude/latitude.
    places = gpd.GeoDataFrame(places, geometry="geometry", crs="EPSG:4326")
    country = country.set_crs("EPSG:4326", allow_override=True)

    # Perform the spatial join
    # result = gpd.tools.sjoin(places, country, how="left")
    country.plot(ax=ax, color="#cccccc")
    places.plot(ax=ax, markersize=5, color="#cc0000")
    ax.set_axis_off()
    ax.set_title(str(year))
import matplotlib.pyplot as plt
# One panel per year, 2004 through 2009, laid out on a 2 x 3 grid.
year = list(range(2004, 2010))
fig = plt.figure(facecolor='white', figsize=(12, 8))
for k, i in enumerate(year):
    # add_subplot numbers panels from 1; each new panel becomes the current axes.
    ax = fig.add_subplot(2, 3, k + 1)
    plot_points_on_shapefile(i, ax)
plt.tight_layout()
# https://pan.baidu.com/s/1jKjBXOy
# The first spreadsheet row is skipped; the header is taken from the second.
dfa = pd.read_excel(
    '../data/wikileaks/Wikileaks Afghanistan war logs analysis.xlsx',
    skiprows=1,
)
dfa.head()

# Quick look at the month names, numbered in order of first appearance.
{name: str(pos) for pos, name in enumerate(dfa.Month.unique(), start=1)}

# Month name -> zero-padded month number.
month_dic = {
    'January': '01',
    'February': '02',
    'March': '03',
    'April': '04',
    'May': '05',
    'June': '06',
    'July': '07',
    'August': '08',
    'September': '09',
    'October': '10',
    'November': '11',
    'December': '12',
}

# Keep the columns from the first one up to and including 'TOTAL'.
dfa = dfa.loc[:, :'TOTAL']

# Build a first-of-month timestamp from the Year and Month columns.
dfa['month'] = [month_dic[name] for name in dfa.Month]
dfa['time'] = dfa.Year.astype(str) + dfa.month + '01'
dfa['time'] = pd.to_datetime(dfa.time, format='%Y%m%d')
dfa = dfa.set_index("time")

# Plot the three count series against time.
plotted_columns = ['Explosion/Ambush', 'Found/cleared', 'TOTAL']
dfa[plotted_columns].plot(figsize=(15, 5))
plt.show()