使用NetworkX分析网络

使用NetworkX分析网络#

NetworkX is a Python package for the creation, manipulation, and study of the structure, dynamics, and functions of complex networks.

https://networkx.github.io/documentation/stable/tutorial.html

%matplotlib inline
import networkx as nx
import matplotlib.cm as cm
import matplotlib.pyplot as plt

import networkx as nx

G=nx.Graph() # G = nx.DiGraph() # 有向网络
# 添加（孤立）节点
G.add_node("spam")
# 添加节点和链接
G.add_edge(1,2)

print(G.nodes())

print(G.edges())

['spam', 1, 2]
[(1, 2)]

# 绘制网络
nx.draw(G, with_labels = True)

_images/755e493eb56b6301b5d9e0dcf17ab8d1195f44c01c46642cd08079dfa737acaa.png

WWW Data download

https://snap.stanford.edu/data/web-NotreDame.html

上面的数据也可以在Barabasi的networksciencebook网页下载 http://networksciencebook.com/translations/en/resources/data.html

World-Wide-Web: [README] [DATA] Réka Albert, Hawoong Jeong and Albert-László Barabási: Diameter of the World Wide Web Nature 401, 130 (1999) [ PDF ]

作业

下载www数据
构建networkx的网络对象g（提示：有向网络）
将www数据添加到g当中
计算网络中的节点数量和链接数量

G = nx.Graph()
n = 0
with open ('/Users/datalab/bigdata/cjc/www.dat.gz.txt') as f:
    for line in f:
        n += 1
        #if n % 10**4 == 0:
            #flushPrint(n)
        x, y = line.rstrip().split(' ')
        G.add_edge(x,y)
        

nx.info(G)

'Graph with 325729 nodes and 1117563 edges'

描述网络#

nx.karate_club_graph#

我们从karate_club_graph开始，探索网络的基本性质。

G = nx.karate_club_graph()
 
clubs = [G.nodes[i]['club'] for i in G.nodes()]
colors = []
for j in clubs:
    if j == 'Mr. Hi':
        colors.append('r')
    else:
        colors.append('g')
 
nx.draw(G,  with_labels = True, node_color = colors)

_images/f9929b98dccb6eb2c56ae5baff27f2a58b1a89041b64f2f10304f6fa1c507015.png

G.nodes[1], G.nodes[9] # 节点1的属性 # 节点1的属性

({'club': 'Mr. Hi'}, {'club': 'Officer'})

G.edges# 前三条边的id
#dir(G)

EdgeView([(0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7), (0, 8), (0, 10), (0, 11), (0, 12), (0, 13), (0, 17), (0, 19), (0, 21), (0, 31), (1, 2), (1, 3), (1, 7), (1, 13), (1, 17), (1, 19), (1, 21), (1, 30), (2, 3), (2, 7), (2, 8), (2, 9), (2, 13), (2, 27), (2, 28), (2, 32), (3, 7), (3, 12), (3, 13), (4, 6), (4, 10), (5, 6), (5, 10), (5, 16), (6, 16), (8, 30), (8, 32), (8, 33), (9, 33), (13, 33), (14, 32), (14, 33), (15, 32), (15, 33), (18, 32), (18, 33), (19, 33), (20, 32), (20, 33), (22, 32), (22, 33), (23, 25), (23, 27), (23, 29), (23, 32), (23, 33), (24, 25), (24, 27), (24, 31), (25, 31), (26, 29), (26, 33), (27, 33), (28, 31), (28, 33), (29, 32), (29, 33), (30, 32), (30, 33), (31, 32), (31, 33), (32, 33)])

nx.info(G)

'Graph named "Zachary\'s Karate Club" with 34 nodes and 78 edges'

G.nodes()

NodeView((0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33))

list(G.edges())[:3]

[(0, 1), (0, 2), (0, 3)]

print(*G.neighbors(1))

0 2 3 7 13 17 19 21 30

nx.average_shortest_path_length(G) 

2.408199643493761

网络直径#

nx.diameter(G)#返回图G的直径（最长最短路径的长度）

密度#

nx.density(G)

0.13903743315508021

nodeNum = len(G.nodes())
edgeNum = len(G.edges())

2.0*edgeNum/(nodeNum * (nodeNum - 1))

0.13903743315508021

作业

计算www网络的网络密度

聚集系数#

cc = nx.clustering(G)
cc.items()

dict_items([(0, 0.15), (1, 0.3333333333333333), (2, 0.24444444444444444), (3, 0.6666666666666666), (4, 0.6666666666666666), (5, 0.5), (6, 0.5), (7, 1.0), (8, 0.5), (9, 0), (10, 0.6666666666666666), (11, 0), (12, 1.0), (13, 0.6), (14, 1.0), (15, 1.0), (16, 1.0), (17, 1.0), (18, 1.0), (19, 0.3333333333333333), (20, 1.0), (21, 1.0), (22, 1.0), (23, 0.4), (24, 0.3333333333333333), (25, 0.3333333333333333), (26, 1.0), (27, 0.16666666666666666), (28, 0.3333333333333333), (29, 0.6666666666666666), (30, 0.5), (31, 0.2), (32, 0.19696969696969696), (33, 0.11029411764705882)])

plt.hist(cc.values(), bins = 15)
plt.xlabel('$Clustering \, Coefficient, \, C$', fontsize = 20)
plt.ylabel('$Frequency, \, F$', fontsize = 20)
plt.show()

_images/53246658f4160517c562d8af60dd0be0dfb5d76928c6ca8b8f4a173619a7f705.png

Spacing in Math Mode#

In a math environment, LaTeX ignores the spaces you type and puts in the spacing that it thinks is best. LaTeX formats mathematics the way it’s done in mathematics texts. If you want different spacing, LaTeX provides the following four commands for use in math mode:

; - a thick space

: - a medium space

, - a thin space

\! - a negative thin space

匹配系数#

Assortativity measures the similarity of connections in the graph with respect to the node degree.

M. E. J. Newman, Mixing patterns in networks, Physical Review E, 67 026126, 2003

# M. E. J. Newman, Mixing patterns in networks Physical Review E, 67 026126, 2003
nx.degree_assortativity_coefficient(G) #计算一个图的度匹配性。

-0.47561309768461413

Ge=nx.Graph()
Ge.add_nodes_from([0,1],size=2)
Ge.add_nodes_from([2,3],size=3)
Ge.add_edges_from([(0,1),(2,3)])
node_size = [list(Ge.nodes[i].values())[0]*1000 for i in Ge.nodes()]
nx.draw(Ge, with_labels = True, node_size = node_size)

print(nx.numeric_assortativity_coefficient(Ge,'size')) 

1.0

_images/6060fc4883886fb1bacffa38b20b524b93a8bc18956b657caac18de1e34a9be0.png

# plot degree correlation  
from collections import defaultdict
import numpy as np

l=defaultdict(list)
g = nx.karate_club_graph()

for i in g.nodes():
    k = []
    for j in g.neighbors(i):
        k.append(g.degree(j))
    l[g.degree(i)].append(np.mean(k))   
    #l.append([g.degree(i),np.mean(k)])
  
x = list(l.keys())
y = [np.mean(i) for i in l.values()]

#x, y = np.array(l).T
plt.plot(x, y, 'ro', label = '$Karate\;Club$')
plt.legend(loc=1,fontsize=10, numpoints=1)
plt.xscale('log'); plt.yscale('log')
plt.ylabel(r'$<knn(k)$> ', fontsize = 20)
plt.xlabel('$k$', fontsize = 20)
plt.show()

_images/b740cdc5dabfaf02ed77e827941c0477b8dab480a2d89723f4fbb9cbd91eff1b.png

Degree centrality measures.（度中心性）#

degree_centrality(G) # Compute the degree centrality for nodes.
in_degree_centrality(G) # Compute the in-degree centrality for nodes.
out_degree_centrality(G) # Compute the out-degree centrality for nodes.
closeness_centrality(G[, v, weighted_edges]) # Compute closeness centrality for nodes.
betweenness_centrality(G[, normalized, …]) # Betweenness centrality measures.（介数中心性）

degree_centrality(G)

The degree centrality values are normalized by dividing by the maximum possible degree.

The maximum possible degreein a simple graph is n-1 where n is the number of nodes in G.

closeness_centrality(G[, v, weighted_edges])

Closeness centrality of a node u is the reciprocal of the average shortest path distance to u over all n-1 reachable nodes.

\[C(u) = \frac{n - 1}{\sum_{v=1}^{n-1} d(v, u)}\]

For graphs with more than one connected component:

\[C_{WF}(u) = \frac{n-1}{N-1} \frac{n - 1}{\sum_{v=1}^{n-1} d(v, u)}\]

Letting N denote the number of nodes in the graph

Linton C. Freeman: Centrality in networks: I. Conceptual clarification. Social Networks 1:215-239, 1979.
Wasserman, S. and Faust, K., Social Network Analysis: Methods and Applications, 1994, Cambridge University Press.pp.201.

betweenness_centrality(G)

Compute the shortest-path betweenness centrality for nodes.

Betweenness centrality of a node 𝑣 is the sum of the fraction of all-pairs shortest paths that pass through 𝑣

\[c_B(v) =\sum_{s,t \in V} \frac{\sigma(s, t|v)}{\sigma(s, t)}\]

s, t denote a node
𝑉 is the set of nodes
𝜎(𝑠,𝑡) is the number of shortest (𝑠,𝑡)-paths
𝜎(𝑠,𝑡|𝑣) is the number of those paths passing through some node 𝑣 other than 𝑠,𝑡.

dc = nx.degree_centrality(G)
closeness = nx.closeness_centrality(G)
betweenness= nx.betweenness_centrality(G)

fig = plt.figure(figsize=(15, 4),facecolor='white')
ax = plt.subplot(1, 3, 1)
plt.hist(dc.values(), bins = 20)
plt.xlabel('$Degree \, Centrality$', fontsize = 20)
plt.ylabel('$Frequency, \, F$', fontsize = 20)

ax = plt.subplot(1, 3, 2)
plt.hist(closeness.values(), bins = 20)
plt.xlabel('$Closeness \, Centrality$', fontsize = 20)

ax = plt.subplot(1, 3, 3)
plt.hist(betweenness.values(), bins = 20)
plt.xlabel('$Betweenness \, Centrality$', fontsize = 20)
plt.tight_layout()
plt.show() 

_images/419996f22b8934e1f649d46bea89ab9442c78ced3fd7bfa29293ea46ce39e4c5.png

fig = plt.figure(figsize=(15, 8),facecolor='white')
plt.style.use('ggplot')

for k in betweenness:
    plt.scatter(dc[k], closeness[k], s = betweenness[k]*10000)
    plt.text(dc[k], closeness[k]+0.02, str(k))
plt.xlabel('$Degree \, Centrality$', fontsize = 20)
plt.ylabel('$Closeness \, Centrality$', fontsize = 20)
plt.show()

_images/9cf6de87a0df0497086e67397a99e5ac32aa2412c5176af45719ab3461ea434f.png

度分布#

from collections import defaultdict
import numpy as np

def plotDegreeDistribution(G):
    degs = defaultdict(int)
    for i in dict(G.degree()).values(): degs[i]+=1
    items = sorted ( degs.items () )
    x, y = np.array(items).T
    y_sum = np.sum(y)
    y = [float(i)/y_sum for i in y]
    plt.plot(x, y, 'b-o')
    plt.xscale('log')
    plt.yscale('log')
    plt.legend(['Degree'])
    plt.xlabel('$K$', fontsize = 20)
    plt.ylabel('$P(K)$', fontsize = 20)
    plt.title('$Degree\,Distribution$', fontsize = 20)
    plt.show()   
    
G = nx.karate_club_graph()     
plotDegreeDistribution(G)

_images/319e235cdfa866223ff0fec333e7031f0dc5f3ab516a10dbe7c1e2ee4dfbf1ce.png

分析网络结构#

规则网络#

import networkx as nx
import matplotlib.pyplot as plt
RG = nx.random_graphs.random_regular_graph(3,200)  
#生成包含200个节点、 每个节点有3个邻居的规则图RG
pos = nx.spectral_layout(RG)          
#定义一个布局，此处采用了spectral布局方式，后变还会介绍其它布局方式，注意图形上的区别
nx.draw(RG,pos,with_labels=False,node_size = range(1, 201))  
#绘制规则图的图形，with_labels决定节点是非带标签（编号），node_size是节点的直径
plt.show()  #显示图形 

_images/f4c3205f1ca1a4c61567910ea8146e463a9fdc359bdb704e7578583229837f3e.png

plotDegreeDistribution(RG)

_images/05dd4d260b81ed92fbf7dd540b76e409a53a13b9e79d7271e99b88f28ed812b5.png

ER随机网络#

import networkx as nx
import matplotlib.pyplot as plt
ER = nx.random_graphs.erdos_renyi_graph(1000,0.01)  
#生成包含20个节点、以概率0.2连接的随机图
pos = nx.spring_layout(ER)          
#定义一个布局，此处采用了shell布局方式
nx.draw(ER,pos,with_labels=False,node_size = 30) 
plt.show()

_images/05482c17322366eb49fb1cafc8ec498a24fcb77f4574f1ddae3cc58fd837185f.png

ER = nx.random_graphs.erdos_renyi_graph(1000,0.01)  
plotDegreeDistribution(ER)

_images/e188de5800caa101b5374a5c09165be035360a80ce132c58261e5304291d5513.png

小世界网络#

import networkx as nx
import matplotlib.pyplot as plt
WS = nx.random_graphs.watts_strogatz_graph(500,4,0.1)  
#生成包含200个节点、每个节点4个近邻、随机化重连概率为0.1的小世界网络
pos = nx.spring_layout(WS)          
#定义一个布局，此处采用了circular布局方式
nx.draw(WS,pos,with_labels=False,node_size = 30)  
#绘制图形
plt.show()

_images/d79cdfd80d7db264f5aa08996b811e78dbb2ba99e2e334a8c3f595c2eeda0a7c.png

plotDegreeDistribution(WS)

_images/78343454e411152250b5d3e2eb2627159a4e154ff830e20549c8fee6506e1421.png

nx.diameter(WS)

cc = nx.clustering(WS)
plt.hist(cc.values(), bins = 10)
plt.xlabel('$Clustering \, Coefficient, \, C$', fontsize = 20)
plt.ylabel('$Frequency, \, F$', fontsize = 20)
plt.show()

_images/b1e3a6fe50b5b64f31bc38dc6c426e4f29e921d757ac834bfc1541ba11e49589.png

import numpy as np
np.mean(list(cc.values()))

0.3953333333333333

BA网络#

import networkx as nx
import matplotlib.pyplot as plt
BA= nx.random_graphs.barabasi_albert_graph(500,2)  
#生成n=200、m=2的BA无标度网络
pos = nx.spring_layout(BA)          
#定义一个布局，此处采用了spring布局方式
nx.draw(BA,pos,with_labels=False,node_size = 30)  
#绘制图形
plt.show()

_images/4c59ad0b53147223a09f1b95ffd5f6ccb3b68ac38a9d8c7f576ef0704e956789.png

plotDegreeDistribution(BA)

_images/a423474eb96058279dc41abde5dbcaa3a47f4470d2ba4f6eaddf4212f9a208d3.png

BA= nx.random_graphs.barabasi_albert_graph(20000,2)  
#生成n=20、m=1的BA无标度网络
plotDegreeDistribution(BA) 

_images/a96e70df4849651b1aea739dce77e3035268a01f5e9648d96bcc834a90e867b1.png

import networkx as nx
import matplotlib.pyplot as plt
BA= nx.random_graphs.barabasi_albert_graph(500,1)  
#生成n=20、m=1的BA无标度网络
pos = nx.spring_layout(BA)          
#定义一个布局，此处采用了spring布局方式
nx.draw(BA,pos,with_labels=False,node_size = 30)  
#绘制图形
plt.show()  

_images/ef1b2da7020a85fca686376c1a019cdddc55d0607ccf2401ee2ed63be154ea3e.png

nx.degree_histogram(BA)[:10]

[0, 342, 67, 39, 10, 12, 6, 5, 5, 3]

list(dict(BA.degree()).items())[:10]  

[(0, 12),
 (1, 37),
 (2, 8),
 (3, 7),
 (4, 2),
 (5, 11),
 (6, 8),
 (7, 11),
 (8, 1),
 (9, 14)]

plt.hist( list(dict(BA.degree()).values()) , bins = 100)
# plt.xscale('log')
# plt.yscale('log')
plt.show()

_images/ae3e66b0c3dc48fdfbbde7047e4252074fcbcfbd48654fda952df7b845f083ec.png

from collections import defaultdict
import numpy as np
def plotDegreeDistributionLongTail(G):
    degs = defaultdict(int)
    for i in list(dict(G.degree()).values()): degs[i]+=1
    items = sorted ( degs.items () )
    x, y = np.array(items).T
    y_sum = np.sum(y)
    y = [float(i)/y_sum for i in y]
    plt.plot(x, y, 'b-o')
    plt.legend(['Degree'])
    plt.xlabel('$K$', fontsize = 20)
    plt.ylabel('$P_K$', fontsize = 20)
    plt.title('$Degree\,Distribution$', fontsize = 20)
    plt.show()  
    
BA= nx.random_graphs.barabasi_albert_graph(5000,2)  
#生成n=20、m=1的BA无标度网络    
plotDegreeDistributionLongTail(BA)

_images/dbf3a69270c69f53633c13404e57f129eed2e9fb071d0a06b1efd8828b6559fd.png

def plotDegreeDistribution(G):
    degs = defaultdict(int)
    for i in list(dict(G.degree()).values()): degs[i]+=1
    items = sorted ( degs.items () )
    x, y = np.array(items).T
    x, y = np.array(items).T
    y_sum = np.sum(y)
    plt.plot(x, y, 'b-o')
    plt.xscale('log')
    plt.yscale('log')
    plt.legend(['Degree'])
    plt.xlabel('$K$', fontsize = 20)
    plt.ylabel('$P(K)$', fontsize = 20)
    plt.title('$Degree\,Distribution$', fontsize = 20)
    plt.show()   

BA= nx.random_graphs.barabasi_albert_graph(50000,2)  
#生成n=20、m=1的BA无标度网络        
plotDegreeDistribution(BA)

_images/aadcfc26e4494f68ee301e64254e928186e05704f678cb2fdadaf7162679cd02.png

作业

阅读 Barabasi (1999) Diameter of the world wide web.Nature.401
绘制www网络的出度分布、入度分布
使用BA模型生成节点数为N、幂指数为\(\gamma\)的网络
计算平均路径长度d与节点数量的关系

Ns = [i*10 for i in [1, 10, 100, 1000]]
ds = []
for N in Ns:
    print(N)
    BA= nx.random_graphs.barabasi_albert_graph(N,2)
    d = nx.average_shortest_path_length(BA)
    ds.append(d)

plt.plot(Ns, ds, 'r-o')
plt.xlabel('$N$', fontsize = 20)
plt.ylabel('$<d>$', fontsize = 20)
plt.xscale('log')
plt.show()

_images/0810ab35c524c30948d6c9bcd75ea9a6809d31c972308eb8c4126e9a81d06af6.png