案例:2009年英国国会议员开支丑闻#

MPs expenses scandal

data-journalism/data-journalism.github.io#54

import pandas as pd

import pylab as plt

plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']  # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号, 注意['SimHei']对应这句不行.
plt.style.use('ggplot')
ls './data/'
MPs' expenses claims, Jul-Dec, 2009.xlsx
data.js*
df = pd.read_excel("./data/MPs' expenses claims, Jul-Dec, 2009.xlsx")
df.head()
Name of member Allowance Type Expenditure Type Date Amount, £
0 Adam Afriyie Office running costs (IEP/AOE) Incidentals 2009-07-03 00:00:00 111.09
1 Adam Afriyie Office running costs (IEP/AOE) Incidentals 2009-11-05 00:00:00 111.09
2 Adam Afriyie Office running costs (IEP/AOE) Incidentals 2009-11-05 00:00:00 51.75
3 Adam Holloway Communications Expenditure Delivery charges 2009-11-10 00:00:00 258.22
4 Adam Holloway Communications Expenditure Delivery charges 2009-12-10 00:00:00 879.66
df.columns
Index(['Name of member', 'Allowance Type', 'Expenditure Type', 'Date',
       'Amount, £'],
      dtype='object')
len(df)
38925
df.describe()
Name of member Allowance Type Expenditure Type Date Amount, £
count 38925 38925 38925 38925 38925
unique 643 5 39 140 14441
top Danny Alexander Office running costs (IEP/AOE) Incidentals 2009-12-21 00:00:00 50
freq 173 25787 8672 828 857

清洗数据#

df[df['Amount, £']=='Amount']
Name of member Allowance Type Expenditure Type Date Amount, £
27710 Member Name Allowance Type Expenditure Type Date Amount
df = df[df['Amount, £']!='Amount']
df[df['Name of member']=='John Randall']
Name of member Allowance Type Expenditure Type Date Amount, £
22221 John Randall Office running costs (IEP/AOE) Accommodation costs 2009-08-20 00:00:00 -2134
22222 John Randall Office running costs (IEP/AOE) Other equipment lease 2009-07-01 00:00:00 25.08
df['Amount, £'] = [abs(float(i)) for i in df['Amount, £']]

描述数据#

df[['Name of member', 'Amount, £']].groupby("Name of member").agg('sum')
Amount, £
Name of member
Adam Afriyie 273.93
Adam Holloway 7415.06
Adam Ingram 18578.90
Adam Price 7874.73
Adrian Bailey 10668.61
... ...
William Hague 7370.98
William McCrea 24689.34
William Rennie 27257.33
Willie Bain 822
Yvette Cooper 21450.30

643 rows × 1 columns

dat1 = df[['Name of member', 'Amount, £']].groupby("Name of member").agg('sum')
dat1 = dat1.sort_values(by=['Amount, £'], ascending = False)
dat1[:10]
Amount, £
Name of member
Graham Allen 42753.53
Michael Foster 39999.69
Mark Hunter 34779.11
Gordon Banks 34471.90
Lynda Waltho 34288.55
Joan Humble 33395.89
Phil Wilson 33031.23
Barbara Follett 32861.14
Sian James 32495.32
Helen Jones 32209.45
dat1['Rank'] = range(1, len(dat1)+1)
dat1[:10]
Amount, £ Rank
Name of member
Graham Allen 42753.53 1
Michael Foster 39999.69 2
Mark Hunter 34779.11 3
Gordon Banks 34471.90 4
Lynda Waltho 34288.55 5
Joan Humble 33395.89 6
Phil Wilson 33031.23 7
Barbara Follett 32861.14 8
Sian James 32495.32 9
Helen Jones 32209.45 10
plt.figure(figsize =(16, 6), dpi = 100)

plt.plot(dat1['Rank'], dat1['Amount, £'], 'ro')
# plt.yscale('log')
# plt.xscale('log')

plt.xlabel('Rank')
plt.ylabel('Amount, £')
plt.show()
_images/aa8544bbfc9cd56b86c98783e429a06fd607bcf0b86825c6b13f17afbe18d86f.png
plt.figure(figsize =(16, 6), dpi = 100)

dat10 = dat1[:10]
plt.plot(dat10['Rank'], dat10['Amount, £'], 'ro')
# plt.yscale('log')
# plt.xscale('log')

plt.xlabel('Rank')
plt.ylabel('Amount, £')
plt.show()
_images/aa5c82443c9ba5acf1d3745f35af209603667ecd0ec852d9ed6bbef3925ab28a.png
dat2 = df[['Allowance Type', 'Amount, £']].groupby("Allowance Type").agg('sum')
dat2 = dat2.sort_values(by=['Amount, £'], ascending = False)
dat2
Amount, £
Allowance Type
Office running costs (IEP/AOE) 4561001.11
Cost of staying away from the main home (ACA/PAAE) 3160432.72
Communications Expenditure 2510511.02
Staffing non-payroll 24414.17
dat3 = df[['Expenditure Type', 'Amount, £']].groupby("Expenditure Type").agg('sum')
dat3 = dat3.sort_values(by=['Amount, £'], ascending = False)
dat3
Amount, £
Expenditure Type
Accommodation costs 1809184.32
Publications 1097722.44
Mortgage interest 1056568.92
Delivery charges 1019891.50
Rent 969231.40
Incidentals 806222.41
Telephones 677887.23
Other household costs 463874.76
Subsistence 373680.38
Advertising 261207.10
Council tax 226974.67
Agencies etc 220883.47
Constituency Association payments 187676.45
Utilities 172737.37
Other equipment lease 141508.52
Websites 131689.98
IT equipment purchase 117466.40
Professional fees 90381.10
Other equipment purchase 82802.68
Maintenance, janitorial 79518.88
Hotel Expenses 70102.59
Petty cash 47442.51
Payments to individuals 42585.55
Member travel (AOE) 31841.92
Staff Subsistence 24401.87
Staff office purchases 22137.03
Staff travel rail 8195.26
Staff telephone expenses 6965.79
IT equipment lease 4382.88
Staff subsistence 4071.69
Staff travel car 2888.88
Staff use of home expenses 1969.68
Staff travel air 800.41
Staff travel parking 697.74
Staff travel taxi 521.54
Staff travel public transport 190.85
General Running Costs 40.55
Staff travel tolls 12.30
plt.figure(figsize =(16, 4), dpi = 100)
dat3['Amount, £'].plot(kind = 'bar')
plt.yscale('log')
plt.show()
_images/cf71dbce5da65d01b02d95e213598da2e124fab1773d48ab5b13f84e4fa7f12f.png

桑基图#

from pyecharts import options as opts
from pyecharts.charts import Sankey

nodes = [
    {"name": "category1"},
    {"name": "category2"},
    {"name": "category3"},
    {"name": "category4"},
    {"name": "category5"},
    {"name": "category6"},
]

links = [
    {"source": "category1", "target": "category2", "value": 10},
    {"source": "category2", "target": "category3", "value": 15},
    {"source": "category3", "target": "category4", "value": 20},
    {"source": "category5", "target": "category6", "value": 25},
]
c = (
    Sankey()
    .add(
        "sankey",
        nodes,
        links,
        linestyle_opt=opts.LineStyleOpts(opacity=0.2, curve=0.5, color="source"),
        label_opts=opts.LabelOpts(position="right"),
    )
    .set_global_opts(title_opts=opts.TitleOpts(title="Sankey-基本示例"))
    #.render("sankey_base.html")
    .render_notebook()
)

c
dat4 = df[['Allowance Type','Expenditure Type', 'Amount, £']].groupby(['Expenditure Type', 'Allowance Type'],  as_index = False).agg('sum')
dat4 = dat4.sort_values(by=['Amount, £'], ascending = False)
dat4
Expenditure Type Allowance Type Amount, £
0 Accommodation costs Office running costs (IEP/AOE) 1809184.32
20 Publications Communications Expenditure 1097722.44
13 Mortgage interest Cost of staying away from the main home (ACA/P... 1056568.92
5 Delivery charges Communications Expenditure 1019891.50
21 Rent Cost of staying away from the main home (ACA/P... 969231.40
10 Incidentals Office running costs (IEP/AOE) 806222.41
35 Telephones Office running costs (IEP/AOE) 677887.23
16 Other household costs Cost of staying away from the main home (ACA/P... 463874.76
34 Subsistence Cost of staying away from the main home (ACA/P... 373680.38
1 Advertising Communications Expenditure 261207.10
4 Council tax Cost of staying away from the main home (ACA/P... 226974.67
2 Agencies etc Office running costs (IEP/AOE) 220883.47
3 Constituency Association payments Office running costs (IEP/AOE) 187676.45
36 Utilities Office running costs (IEP/AOE) 172737.37
14 Other equipment lease Office running costs (IEP/AOE) 141508.52
37 Websites Communications Expenditure 131689.98
9 IT equipment purchase Office running costs (IEP/AOE) 117466.40
19 Professional fees Office running costs (IEP/AOE) 90381.10
15 Other equipment purchase Office running costs (IEP/AOE) 82802.68
11 Maintenance, janitorial Office running costs (IEP/AOE) 79518.88
7 Hotel Expenses Cost of staying away from the main home (ACA/P... 70102.59
18 Petty cash Office running costs (IEP/AOE) 47442.51
17 Payments to individuals Office running costs (IEP/AOE) 42585.55
12 Member travel (AOE) Office running costs (IEP/AOE) 31841.92
22 Staff Subsistence Staffing non-payroll 24401.87
23 Staff office purchases Office running costs (IEP/AOE) 22137.03
30 Staff travel rail Office running costs (IEP/AOE) 8195.26
25 Staff telephone expenses Office running costs (IEP/AOE) 6965.79
8 IT equipment lease Office running costs (IEP/AOE) 4382.88
24 Staff subsistence Office running costs (IEP/AOE) 4071.69
27 Staff travel car Office running costs (IEP/AOE) 2888.88
33 Staff use of home expenses Office running costs (IEP/AOE) 1969.68
26 Staff travel air Office running costs (IEP/AOE) 800.41
28 Staff travel parking Office running costs (IEP/AOE) 697.74
31 Staff travel taxi Office running costs (IEP/AOE) 521.54
29 Staff travel public transport Office running costs (IEP/AOE) 190.85
6 General Running Costs Office running costs (IEP/AOE) 40.55
32 Staff travel tolls Staffing non-payroll 12.30
nodes = dat4['Allowance Type'].unique().tolist() + dat4['Expenditure Type'].unique().tolist()
nodes = [{'name': i} for i in nodes]
links = [{'source': dat4['Allowance Type'][i], 'target': dat4['Expenditure Type'][i], 'value': dat4['Amount, £'][i]}  for i in dat4.index]
from pyecharts import options as opts
from pyecharts.charts import Sankey

s = (
    Sankey()
    .add(
        "sankey",
        nodes,
        links,
        linestyle_opt=opts.LineStyleOpts(opacity=0.2, curve=0.5, color="source"),
        label_opts=opts.LabelOpts(position="right"),
    )
    .set_global_opts(title_opts=opts.TitleOpts(title="Sankey Graph of UK MPS"))
)

s.render_notebook()
s.render("sankey_mps.html")
'/Users/datalab/github/data-journalism.github.io/notebook/sankey_mps.html'

https://data-journalism.github.io/notebook/sankey_mps.html