Extracting People's Daily Online (people.com.cn) Search Data with Selenium#


The news search entry point:

http://search.people.com.cn/cnpeople/news/getNewsResult.jsp


Clicking "next page" does not change the URL in the address bar, so the pagination has to be found in the page itself.


Right-click a page number and inspect the element to reveal the underlying link:

print(*range(1, 3))
1 2
# href of the "page 2" link, as it appears in the page source:
#<a href="/cnpeople/search.do?pageNum=2&amp;keyword=%D0%C2%B9%DA&amp;siteName=news&amp;facetFlag=false&amp;nodeType=belongsId&amp;nodeId=">2</a>
# %D0%C2%B9%DA is the GB2312 percent-encoding of the keyword '新冠'
url = 'http://search.people.com.cn/cnpeople/search.do?pageNum='
path = '&keyword=%D0%C2%B9%DA&siteName=news&facetFlag=null&nodeType=belongsId&nodeId='
page_num = range(1, 10)
urls = [url + str(i) + path for i in page_num]
for i in urls:
    print(i)
    
http://search.people.com.cn/cnpeople/search.do?pageNum=1&keyword=%D0%C2%B9%DA&siteName=news&facetFlag=null&nodeType=belongsId&nodeId=
http://search.people.com.cn/cnpeople/search.do?pageNum=2&keyword=%D0%C2%B9%DA&siteName=news&facetFlag=null&nodeType=belongsId&nodeId=
http://search.people.com.cn/cnpeople/search.do?pageNum=3&keyword=%D0%C2%B9%DA&siteName=news&facetFlag=null&nodeType=belongsId&nodeId=
http://search.people.com.cn/cnpeople/search.do?pageNum=4&keyword=%D0%C2%B9%DA&siteName=news&facetFlag=null&nodeType=belongsId&nodeId=
http://search.people.com.cn/cnpeople/search.do?pageNum=5&keyword=%D0%C2%B9%DA&siteName=news&facetFlag=null&nodeType=belongsId&nodeId=
http://search.people.com.cn/cnpeople/search.do?pageNum=6&keyword=%D0%C2%B9%DA&siteName=news&facetFlag=null&nodeType=belongsId&nodeId=
http://search.people.com.cn/cnpeople/search.do?pageNum=7&keyword=%D0%C2%B9%DA&siteName=news&facetFlag=null&nodeType=belongsId&nodeId=
http://search.people.com.cn/cnpeople/search.do?pageNum=8&keyword=%D0%C2%B9%DA&siteName=news&facetFlag=null&nodeType=belongsId&nodeId=
http://search.people.com.cn/cnpeople/search.do?pageNum=9&keyword=%D0%C2%B9%DA&siteName=news&facetFlag=null&nodeType=belongsId&nodeId=
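
The keyword parameter is GB2312 percent-encoding rather than UTF-8; a quick check (a side note, not part of the original notebook) confirms that %D0%C2%B9%DA is 新冠:

from urllib.parse import quote, unquote

print(quote('新冠', encoding='gb2312'))             # %D0%C2%B9%DA
print(unquote('%D0%C2%B9%DA', encoding='gb2312'))   # 新冠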

The results page cannot be fetched directly with requests#

Reminder: your visit may pose a risk to the site and has been intercepted by the cloud-protection security service. (提醒:您的访问可能对网站造成危险,已被云防护安全拦截)

import requests
from bs4 import BeautifulSoup

content = requests.get(urls[0])   # the WAF answers with a block page, not the results
content.encoding = 'utf-8'
soup = BeautifulSoup(content.text, 'html.parser')
soup
<!DOCTYPE html>

<html>
<head>
<meta charset="utf-8"/>
<style>
body{ background:#eff1f0; font-family: microsoft yahei; color:#969696; font-size:12px;}
.online-desc-con { text-align:center; }
.r-tip01 { color: #969696; font-size: 16px; display: block; text-align: center; width: 500px; padding: 0 10px; overflow: hidden; text-overflow: ellipsis; margin: 0 auto 15px; }
.r-tip02 { color: #b1b0b0; font-size: 12px; display: block; margin-top: 20px; margin-bottom: 20px; }
.r-tip02 a:visited { text-decoration: underline; color: #0088CC; }
.r-tip02 a:link { text-decoration: underline; color: #0088CC; }
img { border: 0; }
</style>
<script>
void(function fuckie6(){if(location.hash && /MSIE 6/.test(navigator.userAgent) && !/jsl_sec/.test(location.href)){location.href = location.href.split('#')[0] + '&jsl_sec' + location.hash}})();
var error_403 = '';
if(error_403 == '') {
	error_403 = '当前访问疑似黑客攻击,已被创宇盾拦截。';
}
</script>
</head>
<body>
<div class="online-desc-con" style="width:550px;padding-top:15px;margin:34px auto;">
<a href="http://www.365cyd.com" id="official_site" target="_blank">
<img alt="" id="wafblock" style="margin: 0 auto 17px auto;"/>
<script type="text/javascript">
            var pic = '' || 'hacker';
            document.getElementById("wafblock").src = '/cdn-cgi/image/' + pic + '.png';
        </script>
</a>
<span class="r-tip01"><script>document.write(error_403);</script></span>
<span class="r-tip02">如果您是网站管理员<a href="http://help.365cyd.com/cyd-error-help.html?code=403" target="_blank">点击这里</a>查看详情</span>
<hr/>
<center>client: 36.154.208.6, server: 38ae086, time: 01/Nov/2021:15:30:10 +0800 [80001]</center>
<img alt="" src="/cdn-cgi/image/logo.png"/>
</div>
</body>
</html>
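
Before switching to a browser, the block can also be detected programmatically; a minimal sketch, assuming the block page always carries the 创宇盾 marker seen above:

import requests

resp = requests.get(urls[0])
resp.encoding = 'utf-8'
# '创宇盾' is the WAF name embedded in the block page shown above
if resp.status_code == 403 or '创宇盾' in resp.text:
    print('Blocked by the WAF -- fall back to a real browser (Selenium)')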
For a second crawl, the urls list was regenerated with the compound keyword 新冠 中医 (GB2312-encoded as %D0%C2%B9%DA+%D6%D0%D2%BD) and a wider page range; the first two entries:

for i in urls:
    print(i)
http://search.people.com.cn/cnpeople/search.do?pageNum=1&keyword=%D0%C2%B9%DA+%D6%D0%D2%BD&siteName=news&facetFlag=null&nodeType=belongsId&nodeId=
http://search.people.com.cn/cnpeople/search.do?pageNum=2&keyword=%D0%C2%B9%DA+%D6%D0%D2%BD&siteName=news&facetFlag=null&nodeType=belongsId&nodeId=
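
The cell that rebuilt urls is not shown; a hypothetical reconstruction (variable names reused from above, page count taken from the run below):

from urllib.parse import quote_plus

keyword = quote_plus('新冠 中医', encoding='gb2312')   # '%D0%C2%B9%DA+%D6%D0%D2%BD'
url = 'http://search.people.com.cn/cnpeople/search.do?pageNum='
path = '&keyword=' + keyword + '&siteName=news&facetFlag=null&nodeType=belongsId&nodeId='
urls = [url + str(i) + path for i in range(1, 30)]     # 29 pages in the second run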
from selenium import webdriver
from bs4 import BeautifulSoup
import time

browser = webdriver.Chrome()
dat = []
for k, j in enumerate(urls):
    print(k+1)
    time.sleep(1)                     # throttle requests and let the page render
    browser.get(j)
    source = browser.page_source      # HTML after the browser has run the page's JS
    soup = BeautifulSoup(source, 'html.parser')
    d = soup.find_all('ul')
    while len(d) < 2:                 # fewer than two <ul>s: results did not load, retry
        print(k+1, 'null error and retry')
        time.sleep(1)
        browser.get(j)
        source = browser.page_source
        soup = BeautifulSoup(source, 'html.parser')
        d = soup.find_all('ul')

    for i in d[1:]:                   # skip the first <ul> (page chrome); one <ul> per result
        urli = i.find('a')['href']    # article link
        title = i.find('a').text      # article title
        time_stamp = i.find_all('li')[-1].text.split('\xa0')[-1]   # publication time
        dat.append([k+1, urli, title, time_stamp])

browser.close()
len(dat)
len(dat)
1
2
3
4
5
6
7
8
9
450
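
The sleep-and-retry loop works, but it can spin forever if a page never loads. An alternative sketch using Selenium's explicit waits (not the original code) blocks until the result lists appear, or raises TimeoutException after 10 seconds:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
browser.get(urls[0])
# wait (at most 10 s) until <ul> elements are present in the DOM
WebDriverWait(browser, 10).until(
    EC.presence_of_all_elements_located((By.TAG_NAME, 'ul')))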
The same loop, re-run unchanged over the regenerated 29-page urls list; the page counter, one retry on page 11, and the final len(dat) are printed below:
1
2
3
4
5
6
7
8
9
10
11
11 null error and retry
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
1416
import pandas as pd
df = pd.DataFrame(dat, columns = ['pagenum', 'url', 'title', 'time'])
df.head()
pagenum url title time
0 1 http://world.people.com.cn/n1/2021/1101/c1002-... [美国白宫新闻秘书感染新冠病毒] 2021-11-01 13:17:19
1 1 http://society.people.com.cn/n1/2021/1101/c100... [新冠疫苗第三针开打 会有第四、第五针吗] 2021-11-01 09:09:21
2 1 http://society.people.com.cn/n1/2021/1101/c100... [儿童新冠疫苗接种启动] 2021-11-01 09:04:42
3 1 http://health.people.com.cn/n1/2021/1101/c1473... [改变过往新冠认知 积极引导3—11岁人群疫苗“应接尽接”] 2021-11-01 08:57:45
4 1 http://usa.people.com.cn/n1/2021/1101/c241376-... [美国空军大量士兵拒绝接种新冠疫苗] 2021-11-01 08:47:21
len(df)
1416
df.to_csv('./data/people_com_search20200606.csv', index = False)

Reading data with Pandas#

with open('./data/people_com_search20200606.csv', 'r') as f:
    lines = f.readlines()
len(lines)
1423

The raw file has 1,423 physical lines, yet pd.read_csv below recovers 1,416 rows plus a header (1,417 logical records): a few titles evidently contain embedded newlines, which stay inside quoted CSV fields and are handled correctly by pandas but split naively by readlines.
import pandas as pd
df2 = pd.read_csv('./data/people_com_search20200606.csv')
df2.head()
len(df2)
1416
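
The time column comes back as plain strings; an illustrative follow-up (not in the original) converts it to datetimes for time-based analysis:

# illustrative: parse 'time' strings such as '2021-11-01 13:17:19'
df2['time'] = pd.to_datetime(df2['time'], errors='coerce')
print(df2['time'].dt.date.value_counts().head())   # articles per day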
for i in df2['url'].tolist()[:10]:
    print(i)
http://health.people.com.cn/n1/2020/0606/c14739-31737564.html
http://health.people.com.cn/n1/2020/0606/c14739-31737424.html
http://politics.people.com.cn/n1/2020/0606/c1001-31737282.html
http://opinion.people.com.cn/n1/2020/0606/c1003-31737274.html
http://health.people.com.cn/n1/2020/0605/c14739-31736476.html
http://politics.people.com.cn/n1/2020/0605/c1001-31735966.html
http://world.people.com.cn/n1/2020/0604/c1002-31735815.html
http://www.people.com.cn/n1/2020/0604/c32306-31735734.html
http://cpc.people.com.cn/n1/2020/0604/c419242-31735244.html
http://health.people.com.cn/n1/2020/0604/c14739-31734772.html