snscrape
snscrape is a scraper for social networking services (SNS). It scrapes things like user profiles, hashtags, or searches and returns the discovered items, e.g. the relevant posts.
The following services are currently supported:
Facebook: user profiles, groups, and communities (aka visitor posts)
Instagram: user profiles, hashtags, and locations
Mastodon: user profiles and toots (single or thread)
Reddit: users, subreddits, and searches (via Pushshift)
Telegram: channels
Twitter: users, user profiles, hashtags, searches, tweets (single or surrounding thread), list posts, and trends
VKontakte: user profiles
Weibo (Sina Weibo): user profiles
Requirements
snscrape requires Python 3.8 or higher. The Python package dependencies are installed automatically when you install snscrape.
Note that one of the dependencies, lxml, also requires libxml2 and libxslt to be installed.
Installation
pip3 install snscrape
If you want to use the development version:
pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git
# Check that lxml imports correctly; libxml2 and libxslt are the underlying
# C libraries that lxml depends on, not Python modules, so they are not imported here.
import lxml
!pip3 install snscrape
Collecting snscrape
Downloading snscrape-0.4.3.20220106-py3-none-any.whl (59 kB)
|████████████████████████████████| 59 kB 733 kB/s eta 0:00:01
Requirement already satisfied: beautifulsoup4 in /opt/anaconda3/lib/python3.9/site-packages (from snscrape) (4.10.0)
Requirement already satisfied: lxml in /opt/anaconda3/lib/python3.9/site-packages (from snscrape) (4.6.3)
Requirement already satisfied: filelock in /opt/anaconda3/lib/python3.9/site-packages (from snscrape) (3.3.1)
Requirement already satisfied: requests[socks] in /opt/anaconda3/lib/python3.9/site-packages (from snscrape) (2.26.0)
Requirement already satisfied: soupsieve>1.2 in /opt/anaconda3/lib/python3.9/site-packages (from beautifulsoup4->snscrape) (2.2.1)
Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.9/site-packages (from requests[socks]->snscrape) (3.2)
Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.9/site-packages (from requests[socks]->snscrape) (2021.10.8)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/anaconda3/lib/python3.9/site-packages (from requests[socks]->snscrape) (1.26.7)
Requirement already satisfied: charset-normalizer~=2.0.0 in /opt/anaconda3/lib/python3.9/site-packages (from requests[socks]->snscrape) (2.0.4)
Requirement already satisfied: PySocks!=1.5.7,>=1.5.6 in /opt/anaconda3/lib/python3.9/site-packages (from requests[socks]->snscrape) (1.7.1)
Installing collected packages: snscrape
Successfully installed snscrape-0.4.3.20220106
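With snscrape installed, a quick sanity check from Python confirms that the Twitter module works before building the full scraper. This is a minimal sketch; the search query and the limit of five tweets are purely illustrative:
import snscrape.modules.twitter as sntwitter

# Print the date and URL of the first five tweets matching a test search
for i, tweet in enumerate(sntwitter.TwitterSearchScraper('bitcoin since:2022-02-10 until:2022-02-11').get_items()):
    if i >= 5:
        break
    print(tweet.date, tweet.url)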
## Import the required packages
import pandas as pd
import datetime
import os
import snscrape.modules.twitter as sntwitter
import time
import random

# Route all requests through a local proxy (adjust the address or remove these lines if no proxy is needed)
os.environ['http_proxy'] = "http://127.0.0.1:9999"
os.environ['https_proxy'] = "http://127.0.0.1:9999"
## Build the lists of daily (since, until) date windows
def get_date_list(begin_date, end_date):
    begin_date_dt = datetime.datetime.strptime(begin_date, "%Y-%m-%d")
    end_date_dt = datetime.datetime.strptime(end_date, "%Y-%m-%d")
    begin_date_second = (begin_date_dt + datetime.timedelta(days=1)).strftime("%Y-%m-%d")
    end_date_last = (end_date_dt + datetime.timedelta(days=-1)).strftime("%Y-%m-%d")
    since_date = pd.date_range(begin_date, end_date_last, freq="D").strftime("%Y%m%d").to_list()
    since_date = [dt[:4] + '-' + dt[4:6] + '-' + dt[6:8] for dt in since_date]
    until_date = pd.date_range(begin_date_second, end_date, freq="D").strftime("%Y%m%d").to_list()
    until_date = [dt[:4] + '-' + dt[4:6] + '-' + dt[6:8] for dt in until_date]
    return since_date, until_date
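A quick check of what this helper returns, shown as a minimal sketch with the expected output in comments:
since_date, until_date = get_date_list("2022-02-10", "2022-02-12")
# since_date -> ['2022-02-10', '2022-02-11']
# until_date -> ['2022-02-11', '2022-02-12']
# Each (since, until) pair spans exactly one day of the requested window.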
## Save a DataFrame to a local CSV file
def saveFile(df, path, filename):
    '''
    Purpose: save the scraped data to a local file.
    Parameters: the DataFrame to save, the target directory, and the file name.
    '''
    # Create the directory if it does not exist yet
    if not os.path.exists(path):
        os.makedirs(path)
    # Write the file
    df.to_csv(path + filename)  # pass index=False to drop the index column
## Sleep for a random amount of time
def random_sleep(mu=60, sigma=60):
    '''Normally distributed random sleep, used to space out requests.
    :param mu: mean sleep time in seconds
    :param sigma: standard deviation, controls how much the sleep time varies
    '''
    secs = random.normalvariate(mu, sigma)
    if secs <= 60:
        secs = mu  # if the draw is too short, fall back to the mean
    time.sleep(secs)
if __name__ == '__main__':
    # Read the start date, end date, and search keyword
    begin_date = input('Enter the start date: ')
    end_date = input('Enter the end date: ')
    keyword = input('Enter the search keyword: ')
    since_date, until_date = get_date_list(begin_date, end_date)
    for start_date, end_date in zip(since_date, until_date):
        year = start_date[:4]
        month = start_date[5:7]
        day = start_date[-2:]
        path = f"{keyword}" + '/' + year + '-' + month + '/'
        # Creating list to append tweet data
        tweets_list = []
        try:
            # Using TwitterSearchScraper to scrape the keyword for this one-day window
            for i, tweet in enumerate(sntwitter.TwitterSearchScraper(f"{keyword} since:{start_date} until:{end_date}").get_items()):
                tweets_list.append([
                    tweet.date, tweet.url, tweet.id, tweet.rawContent, tweet.replyCount, tweet.retweetCount, tweet.likeCount,
                    tweet.quoteCount, tweet.conversationId, tweet.lang, tweet.source, tweet.sourceUrl, tweet.links, tweet.media,
                    tweet.retweetedTweet, tweet.quotedTweet, tweet.inReplyToTweetId, tweet.inReplyToUser, tweet.mentionedUsers,
                    tweet.coordinates, tweet.place, tweet.hashtags, tweet.cashtags, tweet.card,  # tweet-level fields
                    tweet.user.username, tweet.user.displayname, tweet.user.id, tweet.user.rawDescription, tweet.user.descriptionLinks,
                    tweet.user.verified, tweet.user.created, tweet.user.followersCount, tweet.user.friendsCount, tweet.user.statusesCount,
                    tweet.user.favouritesCount, tweet.user.listedCount, tweet.user.mediaCount, tweet.user.location, tweet.user.protected,
                    tweet.user.link, tweet.user.profileImageUrl, tweet.user.profileBannerUrl, tweet.user.label  # user-level fields
                ])
            # Creating a dataframe from the tweets list above
            tweets_df = pd.DataFrame(tweets_list, columns=[
                'Datetime', 'Tweet_url', 'Tweet_id', 'Tweet_content', 'Tweet_reply_count', 'Tweet_retweet_count',
                'Tweet_like_count', 'Tweet_quote_count', 'Tweet_conversation_id', 'Tweet_language', 'Tweet_source',
                'Tweet_source_url', 'Tweet_links', 'Tweet_media', 'Tweet_retweeted_tweet', 'Tweet_quoted_tweet',
                'Tweet_inReplyToTweetId', 'Tweet_inReplyToUser', 'Tweet_mentioned_users', 'Tweet_coordinates', 'Tweet_place',
                'Tweet_hashtags', 'Tweet_cashtags', 'Tweet_card',  # tweet-level fields
                'Username', 'User_displayname', 'User_id', 'User_profile_description', 'User_description_link', 'User_verified',
                'User_created', 'User_followers_count', 'User_friends_count', 'User_statuses_count', 'User_favourites_count',
                'User_listed_count', 'User_media_count', 'User_location', 'User_protected', 'User_profile_link', 'User_profile_image_url',
                'User_profile_banner_url', 'User_label'  # user-level fields
            ])
            fileName = year + '-' + month + '-' + day + '.csv'
            saveFile(tweets_df, path, fileName)
            print("Scraping complete: " + year + '-' + month + '-' + day)
            random_sleep()  # pause for a random interval between days
        except KeyError as error_msg:
            print(f"Sorry, {error_msg} is not a valid key!")
Enter the start date: 2022-02-10
Enter the end date: 2022-02-15
Enter the search keyword: bitcoin
Scraping complete: 2022-02-10
Scraping complete: 2022-02-11
Tweet 1491943423862972423 contains an app icon medium key '4_1582211007291834369' on app 'android_app'/'com.gemini.android.app', but the corresponding medium is missing; dropping
Scraping complete: 2022-02-12
Tweet 1455815359609675778 contains an app icon medium key '4_1582070568056242197' on app 'android_app'/'de.traderepublic.app', but the corresponding medium is missing; dropping
Scraping complete: 2022-02-13
Scraping complete: 2022-02-14
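After the loop finishes, each day's tweets are stored in a separate CSV file under the keyword folder. Below is a minimal sketch for merging them back into a single DataFrame; the 'bitcoin/*/*.csv' pattern assumes the folder layout produced by saveFile above:
import glob
import pandas as pd

# Gather every daily CSV written by saveFile and concatenate them
files = sorted(glob.glob('bitcoin/*/*.csv'))
all_tweets = pd.concat((pd.read_csv(f, index_col=0) for f in files), ignore_index=True)
print(f"Loaded {len(all_tweets)} tweets from {len(files)} files")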
# Scrape a single tweet by its ID with TwitterTweetScraper
tweets_list = []
for i, tweet in enumerate(sntwitter.TwitterTweetScraper(1565127293961248773).get_items()):
    tweets_list.append([
        tweet.date, tweet.url, tweet.id, tweet.rawContent, tweet.replyCount, tweet.retweetCount, tweet.likeCount,
        tweet.quoteCount, tweet.conversationId, tweet.lang, tweet.source, tweet.sourceUrl, tweet.links, tweet.media,
        tweet.retweetedTweet, tweet.quotedTweet, tweet.inReplyToTweetId, tweet.inReplyToUser, tweet.mentionedUsers,
        tweet.coordinates, tweet.place, tweet.hashtags, tweet.cashtags, tweet.card,  # tweet-level fields
        tweet.user.username, tweet.user.displayname, tweet.user.id, tweet.user.rawDescription, tweet.user.descriptionLinks,
        tweet.user.verified, tweet.user.created, tweet.user.followersCount, tweet.user.friendsCount, tweet.user.statusesCount,
        tweet.user.favouritesCount, tweet.user.listedCount, tweet.user.mediaCount, tweet.user.location, tweet.user.protected,
        tweet.user.link, tweet.user.profileImageUrl, tweet.user.profileBannerUrl, tweet.user.label  # user-level fields
    ])
# Creating a dataframe from the tweets list above
tweets_df = pd.DataFrame(tweets_list, columns=[
    'Datetime', 'Tweet_url', 'Tweet_id', 'Tweet_content', 'Tweet_reply_count', 'Tweet_retweet_count',
    'Tweet_like_count', 'Tweet_quote_count', 'Tweet_conversation_id', 'Tweet_language', 'Tweet_source',
    'Tweet_source_url', 'Tweet_links', 'Tweet_media', 'Tweet_retweeted_tweet', 'Tweet_quoted_tweet',
    'Tweet_inReplyToTweetId', 'Tweet_inReplyToUser', 'Tweet_mentioned_users', 'Tweet_coordinates', 'Tweet_place',
    'Tweet_hashtags', 'Tweet_cashtags', 'Tweet_card',  # tweet-level fields
    'Username', 'User_displayname', 'User_id', 'User_profile_description', 'User_description_link', 'User_verified',
    'User_created', 'User_followers_count', 'User_friends_count', 'User_statuses_count', 'User_favourites_count',
    'User_listed_count', 'User_media_count', 'User_location', 'User_protected', 'User_profile_link', 'User_profile_image_url',
    'User_profile_banner_url', 'User_label'  # user-level fields
])
tweets_df
| | Datetime | Tweet_url | Tweet_id | Tweet_content | Tweet_reply_count | Tweet_retweet_count | Tweet_like_count | Tweet_quote_count | Tweet_conversation_id | Tweet_language | ... | User_statuses_count | User_favourites_count | User_listed_count | User_media_count | User_location | User_protected | User_profile_link | User_profile_image_url | User_profile_banner_url | User_label |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2022-09-01 00:00:00+00:00 | https://twitter.com/InformazioneA/status/15651... | 1565127293961248773 | #Report #31August | 0 | 0 | 0 | 0 | 1565127293961248773 | qht | ... | 18973 | 58 | 36 | 12139 | | False | TextLink(text='informazionea.altervista.org', ... | https://pbs.twimg.com/profile_images/123559427... | https://pbs.twimg.com/profile_banners/12352445... | None |

1 rows × 43 columns