案例:2009年英国国会议员开支丑闻#
MPs expenses scandal
data-journalism/data-journalism.github.io#54
import pandas as pd
import pylab as plt
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号, 注意['SimHei']对应这句不行.
plt.style.use('ggplot')
ls './data/'
MPs' expenses claims, Jul-Dec, 2009.xlsx
data.js*
df = pd.read_excel("./data/MPs' expenses claims, Jul-Dec, 2009.xlsx")
df.head()
| Name of member | Allowance Type | Expenditure Type | Date | Amount, £ | |
|---|---|---|---|---|---|
| 0 | Adam Afriyie | Office running costs (IEP/AOE) | Incidentals | 2009-07-03 00:00:00 | 111.09 |
| 1 | Adam Afriyie | Office running costs (IEP/AOE) | Incidentals | 2009-11-05 00:00:00 | 111.09 |
| 2 | Adam Afriyie | Office running costs (IEP/AOE) | Incidentals | 2009-11-05 00:00:00 | 51.75 |
| 3 | Adam Holloway | Communications Expenditure | Delivery charges | 2009-11-10 00:00:00 | 258.22 |
| 4 | Adam Holloway | Communications Expenditure | Delivery charges | 2009-12-10 00:00:00 | 879.66 |
df.columns
Index(['Name of member', 'Allowance Type', 'Expenditure Type', 'Date',
'Amount, £'],
dtype='object')
len(df)
38925
df.describe()
| Name of member | Allowance Type | Expenditure Type | Date | Amount, £ | |
|---|---|---|---|---|---|
| count | 38925 | 38925 | 38925 | 38925 | 38925 |
| unique | 643 | 5 | 39 | 140 | 14441 |
| top | Danny Alexander | Office running costs (IEP/AOE) | Incidentals | 2009-12-21 00:00:00 | 50 |
| freq | 173 | 25787 | 8672 | 828 | 857 |
清洗数据#
df[df['Amount, £']=='Amount']
| Name of member | Allowance Type | Expenditure Type | Date | Amount, £ | |
|---|---|---|---|---|---|
| 27710 | Member Name | Allowance Type | Expenditure Type | Date | Amount |
df = df[df['Amount, £']!='Amount']
df[df['Name of member']=='John Randall']
| Name of member | Allowance Type | Expenditure Type | Date | Amount, £ | |
|---|---|---|---|---|---|
| 22221 | John Randall | Office running costs (IEP/AOE) | Accommodation costs | 2009-08-20 00:00:00 | -2134 |
| 22222 | John Randall | Office running costs (IEP/AOE) | Other equipment lease | 2009-07-01 00:00:00 | 25.08 |
df['Amount, £'] = [abs(float(i)) for i in df['Amount, £']]
描述数据#
df[['Name of member', 'Amount, £']].groupby("Name of member").agg('sum')
| Amount, £ | |
|---|---|
| Name of member | |
| Adam Afriyie | 273.93 |
| Adam Holloway | 7415.06 |
| Adam Ingram | 18578.90 |
| Adam Price | 7874.73 |
| Adrian Bailey | 10668.61 |
| ... | ... |
| William Hague | 7370.98 |
| William McCrea | 24689.34 |
| William Rennie | 27257.33 |
| Willie Bain | 822 |
| Yvette Cooper | 21450.30 |
643 rows × 1 columns
dat1 = df[['Name of member', 'Amount, £']].groupby("Name of member").agg('sum')
dat1 = dat1.sort_values(by=['Amount, £'], ascending = False)
dat1[:10]
| Amount, £ | |
|---|---|
| Name of member | |
| Graham Allen | 42753.53 |
| Michael Foster | 39999.69 |
| Mark Hunter | 34779.11 |
| Gordon Banks | 34471.90 |
| Lynda Waltho | 34288.55 |
| Joan Humble | 33395.89 |
| Phil Wilson | 33031.23 |
| Barbara Follett | 32861.14 |
| Sian James | 32495.32 |
| Helen Jones | 32209.45 |
dat1['Rank'] = range(1, len(dat1)+1)
dat1[:10]
| Amount, £ | Rank | |
|---|---|---|
| Name of member | ||
| Graham Allen | 42753.53 | 1 |
| Michael Foster | 39999.69 | 2 |
| Mark Hunter | 34779.11 | 3 |
| Gordon Banks | 34471.90 | 4 |
| Lynda Waltho | 34288.55 | 5 |
| Joan Humble | 33395.89 | 6 |
| Phil Wilson | 33031.23 | 7 |
| Barbara Follett | 32861.14 | 8 |
| Sian James | 32495.32 | 9 |
| Helen Jones | 32209.45 | 10 |
plt.figure(figsize =(16, 6), dpi = 100)
plt.plot(dat1['Rank'], dat1['Amount, £'], 'ro')
# plt.yscale('log')
# plt.xscale('log')
plt.xlabel('Rank')
plt.ylabel('Amount, £')
plt.show()
plt.figure(figsize =(16, 6), dpi = 100)
dat10 = dat1[:10]
plt.plot(dat10['Rank'], dat10['Amount, £'], 'ro')
# plt.yscale('log')
# plt.xscale('log')
plt.xlabel('Rank')
plt.ylabel('Amount, £')
plt.show()
dat2 = df[['Allowance Type', 'Amount, £']].groupby("Allowance Type").agg('sum')
dat2 = dat2.sort_values(by=['Amount, £'], ascending = False)
dat2
| Amount, £ | |
|---|---|
| Allowance Type | |
| Office running costs (IEP/AOE) | 4561001.11 |
| Cost of staying away from the main home (ACA/PAAE) | 3160432.72 |
| Communications Expenditure | 2510511.02 |
| Staffing non-payroll | 24414.17 |
dat3 = df[['Expenditure Type', 'Amount, £']].groupby("Expenditure Type").agg('sum')
dat3 = dat3.sort_values(by=['Amount, £'], ascending = False)
dat3
| Amount, £ | |
|---|---|
| Expenditure Type | |
| Accommodation costs | 1809184.32 |
| Publications | 1097722.44 |
| Mortgage interest | 1056568.92 |
| Delivery charges | 1019891.50 |
| Rent | 969231.40 |
| Incidentals | 806222.41 |
| Telephones | 677887.23 |
| Other household costs | 463874.76 |
| Subsistence | 373680.38 |
| Advertising | 261207.10 |
| Council tax | 226974.67 |
| Agencies etc | 220883.47 |
| Constituency Association payments | 187676.45 |
| Utilities | 172737.37 |
| Other equipment lease | 141508.52 |
| Websites | 131689.98 |
| IT equipment purchase | 117466.40 |
| Professional fees | 90381.10 |
| Other equipment purchase | 82802.68 |
| Maintenance, janitorial | 79518.88 |
| Hotel Expenses | 70102.59 |
| Petty cash | 47442.51 |
| Payments to individuals | 42585.55 |
| Member travel (AOE) | 31841.92 |
| Staff Subsistence | 24401.87 |
| Staff office purchases | 22137.03 |
| Staff travel rail | 8195.26 |
| Staff telephone expenses | 6965.79 |
| IT equipment lease | 4382.88 |
| Staff subsistence | 4071.69 |
| Staff travel car | 2888.88 |
| Staff use of home expenses | 1969.68 |
| Staff travel air | 800.41 |
| Staff travel parking | 697.74 |
| Staff travel taxi | 521.54 |
| Staff travel public transport | 190.85 |
| General Running Costs | 40.55 |
| Staff travel tolls | 12.30 |
plt.figure(figsize =(16, 4), dpi = 100)
dat3['Amount, £'].plot(kind = 'bar')
plt.yscale('log')
plt.show()
桑基图#
from pyecharts import options as opts
from pyecharts.charts import Sankey
nodes = [
{"name": "category1"},
{"name": "category2"},
{"name": "category3"},
{"name": "category4"},
{"name": "category5"},
{"name": "category6"},
]
links = [
{"source": "category1", "target": "category2", "value": 10},
{"source": "category2", "target": "category3", "value": 15},
{"source": "category3", "target": "category4", "value": 20},
{"source": "category5", "target": "category6", "value": 25},
]
c = (
Sankey()
.add(
"sankey",
nodes,
links,
linestyle_opt=opts.LineStyleOpts(opacity=0.2, curve=0.5, color="source"),
label_opts=opts.LabelOpts(position="right"),
)
.set_global_opts(title_opts=opts.TitleOpts(title="Sankey-基本示例"))
#.render("sankey_base.html")
.render_notebook()
)
c
dat4 = df[['Allowance Type','Expenditure Type', 'Amount, £']].groupby(['Expenditure Type', 'Allowance Type'], as_index = False).agg('sum')
dat4 = dat4.sort_values(by=['Amount, £'], ascending = False)
dat4
| Expenditure Type | Allowance Type | Amount, £ | |
|---|---|---|---|
| 0 | Accommodation costs | Office running costs (IEP/AOE) | 1809184.32 |
| 20 | Publications | Communications Expenditure | 1097722.44 |
| 13 | Mortgage interest | Cost of staying away from the main home (ACA/P... | 1056568.92 |
| 5 | Delivery charges | Communications Expenditure | 1019891.50 |
| 21 | Rent | Cost of staying away from the main home (ACA/P... | 969231.40 |
| 10 | Incidentals | Office running costs (IEP/AOE) | 806222.41 |
| 35 | Telephones | Office running costs (IEP/AOE) | 677887.23 |
| 16 | Other household costs | Cost of staying away from the main home (ACA/P... | 463874.76 |
| 34 | Subsistence | Cost of staying away from the main home (ACA/P... | 373680.38 |
| 1 | Advertising | Communications Expenditure | 261207.10 |
| 4 | Council tax | Cost of staying away from the main home (ACA/P... | 226974.67 |
| 2 | Agencies etc | Office running costs (IEP/AOE) | 220883.47 |
| 3 | Constituency Association payments | Office running costs (IEP/AOE) | 187676.45 |
| 36 | Utilities | Office running costs (IEP/AOE) | 172737.37 |
| 14 | Other equipment lease | Office running costs (IEP/AOE) | 141508.52 |
| 37 | Websites | Communications Expenditure | 131689.98 |
| 9 | IT equipment purchase | Office running costs (IEP/AOE) | 117466.40 |
| 19 | Professional fees | Office running costs (IEP/AOE) | 90381.10 |
| 15 | Other equipment purchase | Office running costs (IEP/AOE) | 82802.68 |
| 11 | Maintenance, janitorial | Office running costs (IEP/AOE) | 79518.88 |
| 7 | Hotel Expenses | Cost of staying away from the main home (ACA/P... | 70102.59 |
| 18 | Petty cash | Office running costs (IEP/AOE) | 47442.51 |
| 17 | Payments to individuals | Office running costs (IEP/AOE) | 42585.55 |
| 12 | Member travel (AOE) | Office running costs (IEP/AOE) | 31841.92 |
| 22 | Staff Subsistence | Staffing non-payroll | 24401.87 |
| 23 | Staff office purchases | Office running costs (IEP/AOE) | 22137.03 |
| 30 | Staff travel rail | Office running costs (IEP/AOE) | 8195.26 |
| 25 | Staff telephone expenses | Office running costs (IEP/AOE) | 6965.79 |
| 8 | IT equipment lease | Office running costs (IEP/AOE) | 4382.88 |
| 24 | Staff subsistence | Office running costs (IEP/AOE) | 4071.69 |
| 27 | Staff travel car | Office running costs (IEP/AOE) | 2888.88 |
| 33 | Staff use of home expenses | Office running costs (IEP/AOE) | 1969.68 |
| 26 | Staff travel air | Office running costs (IEP/AOE) | 800.41 |
| 28 | Staff travel parking | Office running costs (IEP/AOE) | 697.74 |
| 31 | Staff travel taxi | Office running costs (IEP/AOE) | 521.54 |
| 29 | Staff travel public transport | Office running costs (IEP/AOE) | 190.85 |
| 6 | General Running Costs | Office running costs (IEP/AOE) | 40.55 |
| 32 | Staff travel tolls | Staffing non-payroll | 12.30 |
nodes = dat4['Allowance Type'].unique().tolist() + dat4['Expenditure Type'].unique().tolist()
nodes = [{'name': i} for i in nodes]
links = [{'source': dat4['Allowance Type'][i], 'target': dat4['Expenditure Type'][i], 'value': dat4['Amount, £'][i]} for i in dat4.index]
from pyecharts import options as opts
from pyecharts.charts import Sankey
s = (
Sankey()
.add(
"sankey",
nodes,
links,
linestyle_opt=opts.LineStyleOpts(opacity=0.2, curve=0.5, color="source"),
label_opts=opts.LabelOpts(position="right"),
)
.set_global_opts(title_opts=opts.TitleOpts(title="Sankey Graph of UK MPS"))
)
s.render_notebook()
s.render("sankey_mps.html")
'/Users/datalab/github/data-journalism.github.io/notebook/sankey_mps.html'