In [13]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
In [28]:
print(plt.style.available)
['seaborn-dark', 'seaborn-darkgrid', 'seaborn-ticks', 'fivethirtyeight', 'seaborn-whitegrid', 'classic', 'seaborn-talk', 'seaborn-dark-palette', 'seaborn-bright', 'seaborn-pastel', 'grayscale', 'seaborn-notebook', 'ggplot', 'seaborn-colorblind', 'seaborn-muted', 'seaborn-paper', 'bmh', 'seaborn-white', 'dark_background', 'seaborn-poster', 'seaborn-deep']
In [47]:
sns.set_style("whitegrid") 
plt.style.use('seaborn-whitegrid')
In [170]:
# https://www.theguardian.com/world/datablog/2010/jul/26/wikileaks-afghanistan-ied-attacks
# or https://pan.baidu.com/s/1jKjBXOy
df = pd.read_excel('../data/wikileaks/ExplodedIED.xls')
df.head()
WARNING *** file size (8971777) not 512 + multiple of sector size (512)
Out[170]:
ReportKey DateOccurred Type Category TrackingNumber Title Summary Region AttackOn ComplexAttack ... EnemyDetained MGRS Latitude Longitude OriginatorGroup UpdatedByGroup CCIR Sigact Affiliation DColor
0 7AEE1FD6-488D-4351-A7DC-16A627ACDE9A 2004-01-04 00:00:00 Explosive Hazard IED Explosion 2007-033-005354-0981 (EXPLOSIVE HAZARD) IED EXPLOSION RPT (VBIED) ... 04 JAN: AT LEAST 15 DEAD IN TRUCK BOMB ATTACK ... RC SOUTH ENEMY 0.0 ... 0.0 41RQQ6257932849 31.000000 65.749992 UNKNOWN JTF Paladin 3rd EOD BN S-3 NaN NaN ENEMY RED
1 99AA5C2F-2129-4200-AB73-AA29D94E0D83 2004-01-06 00:00:00 Explosive Hazard IED Explosion 2007-033-005356-0059 (EXPLOSIVE HAZARD) IED EXPLOSION RPT (UNK) OT... EXPLOSION KILLS 50 NEAR QABA MOSQUE IN SPIN BO... RC SOUTH ENEMY 0.0 ... 0.0 42RTV5174832504 31.000000 66.400002 UNKNOWN JTF Paladin 3rd EOD BN S-3 NaN NaN ENEMY RED
2 F0E77B28-FDB5-44A4-ABC1-1D21052AA981 2004-01-06 00:00:00 Explosive Hazard IED Explosion 2007-033-005356-0169 (EXPLOSIVE HAZARD) IED EXPLOSION RPT (UNK) OT... ATTACK BY ACM IN QALAT KILLED 27TH DIV. CMD. A... RC SOUTH ENEMY 0.0 ... 0.0 42SUA0184053450 32.099991 66.900002 UNKNOWN JTF Paladin 3rd EOD BN S-3 NaN NaN ENEMY RED
3 D61E2F11-3079-4C5B-A37F-23EA50DCBA31 2004-01-25 00:00:00 Explosive Hazard IED Explosion 2007-033-005407-0968 (EXPLOSIVE HAZARD) IED EXPLOSION RPT (UNK) US... CJSOTF-A REPORTS USMC PATROL AMBUSHED IVO 42SX... RC EAST ENEMY 1.0 ... 0.0 42SXD9410056091 34.828609 71.122498 UNKNOWN J3 ORSA NaN NaN ENEMY RED
4 6B9D2340-1AA2-418A-BFA6-F2F953E52FC5 2004-01-26 00:00:00 Explosive Hazard IED Explosion 2007-033-005353-0543 IED CF Other 2 CF WIA (S//REL GCTF) CJSOTF-A REPORTS IED STRIKE FOLL... RC EAST ENEMY 0.0 ... 0.0 42SXD9431252088 34.792500 71.123894 UNKNOWN UNKNOWN NaN NaN ENEMY RED

5 rows × 31 columns

In [171]:
len(df)
Out[171]:
7527
In [258]:
df['time'] = [str(i)[:8]+'01' for i in df.DateOccurred]
df['time'] = [i if '200' in i else np.nan for i in df.time  ]
df['time'] = pd.to_datetime(df.time, format = '%Y-%m-%d')
df['year'] = [str(i)[:4] for i in df.DateOccurred]
df['year'] = [int(i) if '200' in i else np.nan for i in df.year]
df = df.dropna(subset = ['Latitude','Longitude'])
len(df)
Out[258]:
7346
In [266]:
df.columns
Out[266]:
Index(['ReportKey', 'DateOccurred', 'Type', 'Category', 'TrackingNumber',
       'Title', 'Summary', 'Region', 'AttackOn', 'ComplexAttack',
       'ReportingUnit', 'UnitName', 'TypeOfUnit', 'FriendlyWIA', 'FriendlyKIA',
       'HostNationWIA', 'HostNationKIA', 'CivilianWIA', 'CivilianKIA',
       'EnemyWIA', 'EnemyKIA', 'EnemyDetained', 'MGRS', 'Latitude',
       'Longitude', 'OriginatorGroup', 'UpdatedByGroup', 'CCIR', 'Sigact',
       'Affiliation', 'DColor', 'time', 'year'],
      dtype='object')
In [174]:
df.iloc[0]
Out[174]:
ReportKey                       7AEE1FD6-488D-4351-A7DC-16A627ACDE9A
DateOccurred                                     2004-01-04 00:00:00
Type                                                Explosive Hazard
Category                                               IED Explosion
TrackingNumber                                  2007-033-005354-0981
Title              (EXPLOSIVE HAZARD) IED EXPLOSION RPT  (VBIED) ...
Summary            04 JAN: AT LEAST 15 DEAD IN TRUCK BOMB ATTACK ...
Region                                                      RC SOUTH
AttackOn                                                       ENEMY
ComplexAttack                                                      0
ReportingUnit                                                  OTHER
UnitName                                                       OTHER
TypeOfUnit                                             None Selected
FriendlyWIA                                                        0
FriendlyKIA                                                        0
HostNationWIA                                                      0
HostNationKIA                                                      0
CivilianWIA                                                        0
CivilianKIA                                                        0
EnemyWIA                                                           0
EnemyKIA                                                           0
EnemyDetained                                                      0
MGRS                                                 41RQQ6257932849
Latitude                                                          31
Longitude                                                      65.75
OriginatorGroup                                              UNKNOWN
UpdatedByGroup                            JTF Paladin 3rd EOD BN S-3
CCIR                                                             NaN
Sigact                                                           NaN
Affiliation                                                    ENEMY
DColor                                                           RED
Name: 0, dtype: object
In [175]:
plt.figure(figsize = (8, 8))
region_freq = df.groupby('Region').size()
explode = (0.2, 0.1, 0.2, 0.1, 0.2, 0.3)  # only "explode" the 2nd slice (i.e. 'Hogs')
plt.pie(region_freq, labels = region_freq.index, explode = explode, 
        autopct='%0.1f%%', 
        pctdistance=0.2, shadow=True)
plt.show()
In [383]:
df.Region.unique()
Out[383]:
array(['RC SOUTH', 'RC EAST', 'RC CAPITAL', 'RC NORTH', 'RC WEST',
       'UNKNOWN'], dtype=object)
In [382]:
df.Region.value_counts()
Out[382]:
RC SOUTH      3644
RC EAST       2869
RC WEST        356
RC NORTH       253
RC CAPITAL     215
UNKNOWN          9
Name: Region, dtype: int64
In [177]:
plt.figure(figsize = (8, 8))
explode = (0.2, 0.3)  

df.Category.value_counts().plot(kind='pie', explode = explode, 
        autopct='%0.1f%%', 
        pctdistance=0.2, shadow=True)
plt.show()
In [386]:
plt.figure(figsize = (8, 8))
explode = (0.1, 0.1, 0.2, 0.1, 0.2, 0.3)  

df.Region.value_counts().plot(kind='pie', explode = explode, 
        autopct='%0.1f%%', 
        pctdistance=.5, shadow=True)
plt.show()
In [430]:
 
Out[430]:
Timestamp('2007-08-01 00:00:00')
In [425]:
df.time.value_counts().index[20]
Out[425]:
Timestamp('2007-09-01 00:00:00')
In [444]:
df.time.value_counts().plot(kind = 'line', figsize = (15, 5))
xmax = df.time.value_counts().idxmax()
ymax = df.time.value_counts().max()
plt.vlines(x=xmax, ymin = 0, ymax = ymax, color='r')
plt.annotate('The presidential election \n 2018 Aug 20', 
             xytext=(pd.Timestamp('2007-06-01 00:00:00'), ymax),
             xy=(xmax, ymax),
            arrowprops=dict(facecolor='green', shrink=0.05),
            fontsize = 20)
plt.show()
In [180]:
import folium #, jinja2, vincent
from folium import plugins
from IPython.display import IFrame
from IPython.core.display import HTML
print(folium.__file__)
print(folium.__version__)
/Users/chengjun/anaconda3/lib/python3.6/site-packages/folium/__init__.py
0.5.0
In [257]:
nodes = [(df.Latitude[i], df.Longitude[i]) for i in df.index]
map_osm = folium.Map(location=[df.Latitude.mean(), df.Longitude.mean() ], tiles = 'cartodbpositron',
                     #tiles='stamentoner', 
                     zoom_start=5.4)
map_osm.add_child(plugins.HeatMap(nodes))
map_osm.fit_bounds(map_osm.get_bounds())
map_osm
Out[257]:
In [284]:
def year_map(year):
    map_osm = folium.Map(location=[df.Latitude.mean(), df.Longitude.mean() ], tiles = 'cartodbpositron',
                         #tiles='stamentoner', 
                         zoom_start=5.4)


    locations = [(df.Latitude[i], df.Longitude[i]) for i in df[df.year == year].index]

    # locations = nodes
    # popups = ['{}'.format(loc) for loc in locations]

    for coord in locations:
        folium.CircleMarker( location=[ coord[0], coord[1] ] , 
                            radius=1,
                            color = 'red',
                            fill_color='red',
                            weight=.5).add_to( map_osm )

    #Set the zoom to the maximum possible
    map_osm.fit_bounds(map_osm.get_bounds())
    return map_osm
In [289]:
for year in df.year.unique():
    map_osm = year_map(year)
    map_osm.save('../vis/Wikileaks_Afghanistan_Year_' + str(year)+ '.html')
In [285]:
map2004 = year_map(2004)
map2005 = year_map(2005)
In [286]:
map2004
Out[286]:
In [381]:
map2005
Out[381]:
In [290]:
import geopandas as gpd
In [331]:
# Read file using gpd.read_file()
country = gpd.GeoDataFrame.from_file('../data/afghanistan_district398.shp')
In [346]:
country.to_crs()
Out[346]:
{}
In [347]:
places.crs
Out[347]:
{'init': 'epsg:4326'}
In [332]:
country.plot(figsize = (15, 15), color = 'grey')
plt.show()
In [369]:
def plot_points_on_shapefile(year, ax):
    from shapely.geometry import Point
    import geopandas as gpd
    country = gpd.GeoDataFrame.from_file('../data/afghanistan_district398.shp')
    # https://esoc.princeton.edu/files/administrative-boundaries-398-districts
    # Create a DataFrame with some cities, including their location
    places = df[['TrackingNumber', 'Latitude', 'Longitude', 'year']][df.year == year]

    # Create the geometry column from the coordinates
    # Remember that longitude is east-west (i.e. X) and latitude is north-south (i.e. Y)
    places["geometry"] = places.apply(lambda row: Point(row["Longitude"], row["Latitude"]), axis=1)
    del(places["Latitude"], places["Longitude"], places["year"] )

    # # Convert to a GeoDataFrame
    places = gpd.GeoDataFrame(places, geometry="geometry")

    # # Declare the coordinate system for the places GeoDataFrame
    # # GeoPandas doesn't do any transformations automatically when performing
    # # the spatial join. The layers are already in the same CRS (WGS84) so no
    # # transformation is needed.
    places.crs = {"init": "epsg:4326"}
    country.crs = {"init": "epsg:4326"}

    # Perform the spatial join
    #result = gpd.tools.sjoin(places, country, how="left")
    country.plot(ax=ax, color="#cccccc")
    places.plot(ax=ax, markersize=5, color="#cc0000")
    plt.axis('off')
    plt.title(str(year))
In [379]:
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(12, 8),facecolor='white')
year = [2004+i for i in range(6)]
for k, i in enumerate(year):
    ax = fig.add_subplot(2,3,k+1)
    plot_points_on_shapefile(i, ax)
plt.tight_layout()
In [130]:
# https://pan.baidu.com/s/1jKjBXOy
dfa = pd.read_excel('../data/wikileaks/Wikileaks Afghanistan war logs analysis.xlsx', skiprows = 1)
dfa.head()
Out[130]:
Year Month Explosion/Ambush Found/cleared TOTAL Unnamed: 5 Unnamed: 6 Year.1 Explosion/Ambush.1 Found/cleared.1
0 2004 January 5 6 11 NaN NaN 2004 Total 191 117.0
1 2004 February 7 1 8 NaN NaN 2005 Total 366 330.0
2 2004 March 7 2 9 NaN NaN 2006 Total 797 1762.0
3 2004 April 14 1 15 NaN NaN 2007 Total 1147 1005.0
4 2004 May 21 15 36 NaN NaN 2008 Total 1632 1633.0
In [131]:
{i:str(k+1) for k, i in enumerate(dfa.Month.unique())}
Out[131]:
{'April': '4',
 'August': '8',
 'December': '12',
 'February': '2',
 'January': '1',
 'July': '7',
 'June': '6',
 'March': '3',
 'May': '5',
 'November': '11',
 'October': '10',
 'September': '9'}
In [132]:
month_dic = {'April': '04',
 'August': '08',
 'December': '12',
 'February': '02',
 'January': '01',
 'July': '07',
 'June': '06',
 'March': '03',
 'May': '05',
 'November': '11',
 'October': '10',
 'September': '09'}
In [141]:
# slice from the beginning to 'TOTAL'
dfa = dfa.loc[:, :'TOTAL']
dfa['month'] = [month_dic[i] for i in dfa.Month]
dfa['time'] =  [str(dfa.Year[i])+dfa.month[i]+'01' for i in dfa.index]
dfa['time'] = pd.to_datetime(dfa.time, format = '%Y%m%d')
dfa.set_index("time", inplace=True)
In [147]:
dfa[['Explosion/Ambush', 'Found/cleared', 'TOTAL']].plot(figsize=(15, 5))
plt.show()