snscrape#

snscrape is a scraper for social networking services (SNS). It scrapes things like user profiles, hashtags, or searches and returns the discovered items, e.g. the relevant posts.

GitHub repository: https://github.com/JustAnotherArchivist/snscrape

The following services are currently supported:

  • Facebook: user profiles, groups, and communities (aka visitor posts)

  • Instagram: user profiles, hashtags, and locations

  • Mastodon: user profiles and toots (single or thread)

  • Reddit: users, subreddits, and searches (via Pushshift)

  • Telegram: channels

  • Twitter: users, user profiles, hashtags, searches, tweets (single or surrounding thread), list posts, and trends

  • VKontakte: user profiles

  • Weibo (Sina Weibo): user profiles
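
As a quick orientation before the setup steps below, here is a minimal sketch of the Python API this notebook uses later (the scraper class and attribute names are the same ones used in the Twitter example further down; the query "bitcoin" and the five-item cutoff are arbitrary):

import snscrape.modules.twitter as sntwitter

# Print the date and URL of the first five tweets matching a search query
for i, tweet in enumerate(sntwitter.TwitterSearchScraper("bitcoin").get_items()):
    if i >= 5:
        break
    print(tweet.date, tweet.url)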

Requirements#

snscrape requires Python 3.8 or higher. The Python package dependencies are installed automatically when you install snscrape.

Note that one of the dependencies, lxml, also requires libxml2 and libxslt to be installed.
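
If you are unsure whether your lxml build picked up those libraries, a quick check (lxml.etree exposes the version constants used below):

from lxml import etree

# libxml2 / libxslt versions lxml is running against
print(etree.LIBXML_VERSION, etree.LIBXSLT_VERSION)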

Installation#

pip3 install snscrape

If you want to use the development version:

pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git

# lxml depends on the libxml2 and libxslt system libraries, which must be installed separately (they are not Python packages)
import lxml
!pip3 install snscrape
Collecting snscrape
  Downloading snscrape-0.4.3.20220106-py3-none-any.whl (59 kB)
     |████████████████████████████████| 59 kB 733 kB/s eta 0:00:01
Requirement already satisfied: beautifulsoup4 in /opt/anaconda3/lib/python3.9/site-packages (from snscrape) (4.10.0)
Requirement already satisfied: lxml in /opt/anaconda3/lib/python3.9/site-packages (from snscrape) (4.6.3)
Requirement already satisfied: filelock in /opt/anaconda3/lib/python3.9/site-packages (from snscrape) (3.3.1)
Requirement already satisfied: requests[socks] in /opt/anaconda3/lib/python3.9/site-packages (from snscrape) (2.26.0)
Requirement already satisfied: soupsieve>1.2 in /opt/anaconda3/lib/python3.9/site-packages (from beautifulsoup4->snscrape) (2.2.1)
Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.9/site-packages (from requests[socks]->snscrape) (3.2)
Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.9/site-packages (from requests[socks]->snscrape) (2021.10.8)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/anaconda3/lib/python3.9/site-packages (from requests[socks]->snscrape) (1.26.7)
Requirement already satisfied: charset-normalizer~=2.0.0 in /opt/anaconda3/lib/python3.9/site-packages (from requests[socks]->snscrape) (2.0.4)
Requirement already satisfied: PySocks!=1.5.7,>=1.5.6 in /opt/anaconda3/lib/python3.9/site-packages (from requests[socks]->snscrape) (1.7.1)
Installing collected packages: snscrape
Successfully installed snscrape-0.4.3.20220106
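
To confirm which snscrape version ended up in the environment, a quick check with the standard library (importlib.metadata is available from Python 3.8 onwards):

from importlib.metadata import version

# Should print the version pip reported above, e.g. 0.4.3.20220106
print(version("snscrape"))
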
## Import the required packages
import datetime
import os
import random
import time

import pandas as pd
import snscrape.modules.twitter as sntwitter

# Route all HTTP(S) traffic through a local proxy (adjust host/port as needed)
os.environ['http_proxy'] = "http://127.0.0.1:9999"
os.environ['https_proxy'] = "http://127.0.0.1:9999"
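
snscrape talks to the network via requests (see the requests[socks] dependency above), which honours these environment variables. A quick check that they will be picked up:

import requests

# Show which proxies requests would use for Twitter URLs, based on the environment variables set above
print(requests.utils.get_environ_proxies("https://twitter.com"))
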
## Build the list of (since, until) one-day date windows
def get_date_list(begin_date, end_date):
    '''Split [begin_date, end_date] into consecutive one-day windows.
    Returns two parallel lists: since_date[i] / until_date[i] are the
    since:/until: values for the i-th day.
    '''
    begin_date_dt = datetime.datetime.strptime(begin_date, "%Y-%m-%d")
    end_date_dt = datetime.datetime.strptime(end_date, "%Y-%m-%d")
    begin_date_second = (begin_date_dt + datetime.timedelta(days=1)).strftime("%Y-%m-%d")
    end_date_last = (end_date_dt - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
    since_date = pd.date_range(begin_date, end_date_last, freq="D").strftime("%Y-%m-%d").to_list()
    until_date = pd.date_range(begin_date_second, end_date, freq="D").strftime("%Y-%m-%d").to_list()
    return since_date, until_date
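
A quick check of what get_date_list returns for a small range:

since_date, until_date = get_date_list("2022-02-10", "2022-02-13")
print(since_date)  # ['2022-02-10', '2022-02-11', '2022-02-12']
print(until_date)  # ['2022-02-11', '2022-02-12', '2022-02-13']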

## Save the scraped data to a local file
def saveFile(df, path, filename):
    '''
    Purpose: save the scraped data to a local CSV file
    Args: the DataFrame to save, the target directory, the file name
    '''
    # Create the directory if it does not exist yet
    if not os.path.exists(path):
        os.makedirs(path)

    # Write the CSV (pass index=False to drop the DataFrame index)
    df.to_csv(path + filename)
    
## Random sleep between requests
def random_sleep(mu=60, sigma=60):
    '''Sleep for a normally distributed random duration.
    :param mu: mean sleep time in seconds
    :param sigma: standard deviation, controls the spread
    '''
    secs = random.normalvariate(mu, sigma)
    if secs <= 60:
        secs = mu  # reset to the mean if the draw is too small
    time.sleep(secs)
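
To get a feel for the delays this produces without actually sleeping, the same distribution can be sampled directly (illustration only; the values differ from run to run):

# Draw a few candidate delays the way random_sleep does, resetting small draws to the mean
draws = [random.normalvariate(60, 60) for _ in range(5)]
print([round(d, 1) if d > 60 else 60 for d in draws])
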
if __name__ == '__main__':
    # Prompt for the start date, end date and search keyword
    begin_date = input('Enter the start date:')
    end_date = input('Enter the end date:')
    keyword = input('Enter the search keyword:')
    since_date, until_date = get_date_list(begin_date, end_date)
    for start_date, end_date in zip(since_date, until_date):
        year = start_date[:4]
        month = start_date[5:7]
        day = start_date[-2:]
        path = f"{keyword}" + '/' + year + '-' + month + '/'
        # Creating list to append tweet data
        tweets_list = []
        try:
            # Using TwitterSearchScraper to scrape data and append tweets to list
            for i, tweet in enumerate(sntwitter.TwitterSearchScraper(f"{keyword} since:{start_date} until:{end_date}").get_items()):
                tweets_list.append([
                                    tweet.date, tweet.url, tweet.id, tweet.rawContent, tweet.replyCount, tweet.retweetCount, tweet.likeCount,
                                    tweet.quoteCount, tweet.conversationId, tweet.lang, tweet.source, tweet.sourceUrl, tweet.links, tweet.media,
                                    tweet.retweetedTweet, tweet.quotedTweet, tweet.inReplyToTweetId, tweet.inReplyToUser, tweet.mentionedUsers,
                                    tweet.coordinates, tweet.place, tweet.hashtags, tweet.cashtags, tweet.card,  # tweet-level fields
                                    tweet.user.username, tweet.user.displayname, tweet.user.id, tweet.user.rawDescription, tweet.user.descriptionLinks,
                                    tweet.user.verified, tweet.user.created, tweet.user.followersCount, tweet.user.friendsCount, tweet.user.statusesCount,
                                    tweet.user.favouritesCount, tweet.user.listedCount, tweet.user.mediaCount, tweet.user.location, tweet.user.protected,
                                    tweet.user.link, tweet.user.profileImageUrl, tweet.user.profileBannerUrl, tweet.user.label  # user-level fields
                                    ])
            # Creating a dataframe from the tweets list above
            tweets_df = pd.DataFrame(tweets_list, columns=[
                                                            'Datetime', 'Tweet_url', 'Tweet_id', 'Tweet_content', 'Tweet_reply_count', 'Tweet_retweet_count',
                                                            'Tweet_like_count', 'Tweet_quote_count', 'Tweet_conversation_id', 'Tweet_language', 'Tweet_source',
                                                            'Tweet_source_url', 'Tweet_links', 'Tweet_media', 'Tweet_retweeted_tweet', 'Tweet_quoted_tweet',
                                                            'Tweet_inReplyToTweetId', 'Tweet_inReplyToUser', 'Tweet_mentioned_users', 'Tweet_coordinates', 'Tweet_place',
                                                            'Tweet_hashtags', 'Tweet_cashtags', 'Tweet_card',  # tweet-level columns
                                                            'Username', 'User_displayname', 'User_id', 'User_profile_description', 'User_description_link', 'User_verified',
                                                            'User_created', 'User_followers_count', 'User_friends_count', 'User_statuses_count', 'User_favourites_count',
                                                            'User_listed_count', 'User_media_count', 'User_location', 'User_protected', 'User_profile_link', 'User_profile_image_url',
                                                            'User_profile_banner_url', 'User_label'  # user-level columns
                                                          ])

            fileName = year + '-' + month + '-' + day + '.csv'
            saveFile(tweets_df, path, fileName)
            print("Finished scraping: " + year + '-' + month + '-' + day)
            random_sleep()       # random sleep between days
        except KeyError as error_msg:
            print(f"Sorry, {error_msg} is not a valid key!")
Enter the start date:2022-02-10
Enter the end date:2022-02-15
Enter the search keyword:bitcoin
Finished scraping: 2022-02-10
Finished scraping: 2022-02-11
Tweet 1491943423862972423 contains an app icon medium key '4_1582211007291834369' on app 'android_app'/'com.gemini.android.app', but the corresponding medium is missing; dropping
Finished scraping: 2022-02-12
Tweet 1455815359609675778 contains an app icon medium key '4_1582070568056242197' on app 'android_app'/'de.traderepublic.app', but the corresponding medium is missing; dropping
Finished scraping: 2022-02-13
Finished scraping: 2022-02-14
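
Once the per-day files are on disk they can be combined into a single DataFrame for analysis. A small sketch, assuming the keyword "bitcoin" and the keyword/year-month directory layout produced by saveFile above:

import glob
import pandas as pd

# Collect every per-day CSV under the keyword's directory tree and concatenate them
files = sorted(glob.glob("bitcoin/*/*.csv"))
all_tweets = pd.concat((pd.read_csv(f, index_col=0) for f in files), ignore_index=True)
print(len(files), "files,", len(all_tweets), "tweets")
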
# Scrape a single tweet by its ID with TwitterTweetScraper
tweets_list = []
for i, tweet in enumerate(sntwitter.TwitterTweetScraper(1565127293961248773).get_items()):
    tweets_list.append([
                        tweet.date, tweet.url, tweet.id, tweet.rawContent, tweet.replyCount, tweet.retweetCount, tweet.likeCount,
                        tweet.quoteCount, tweet.conversationId, tweet.lang, tweet.source, tweet.sourceUrl, tweet.links, tweet.media,
                        tweet.retweetedTweet, tweet.quotedTweet, tweet.inReplyToTweetId, tweet.inReplyToUser, tweet.mentionedUsers,
                        tweet.coordinates, tweet.place, tweet.hashtags, tweet.cashtags, tweet.card,  # tweet-level fields
                        tweet.user.username, tweet.user.displayname, tweet.user.id, tweet.user.rawDescription, tweet.user.descriptionLinks,
                        tweet.user.verified, tweet.user.created, tweet.user.followersCount, tweet.user.friendsCount, tweet.user.statusesCount,
                        tweet.user.favouritesCount, tweet.user.listedCount, tweet.user.mediaCount, tweet.user.location, tweet.user.protected,
                        tweet.user.link, tweet.user.profileImageUrl, tweet.user.profileBannerUrl, tweet.user.label  # user-level fields
                        ])
# Creating a dataframe from the tweets list above
tweets_df = pd.DataFrame(tweets_list, columns=[
                                                'Datetime', 'Tweet_url', 'Tweet_id', 'Tweet_content', 'Tweet_reply_count', 'Tweet_retweet_count',
                                                'Tweet_like_count', 'Tweet_quote_count', 'Tweet_conversation_id', 'Tweet_language', 'Tweet_source',
                                                'Tweet_source_url', 'Tweet_links', 'Tweet_media', 'Tweet_retweeted_tweet', 'Tweet_quoted_tweet',
                                                'Tweet_inReplyToTweetId', 'Tweet_inReplyToUser', 'Tweet_mentioned_users', 'Tweet_coordinates', 'Tweet_place',
                                                'Tweet_hashtags', 'Tweet_cashtags', 'Tweet_card',  # tweet-level columns
                                                'Username', 'User_displayname', 'User_id', 'User_profile_description', 'User_description_link', 'User_verified',
                                                'User_created', 'User_followers_count', 'User_friends_count', 'User_statuses_count', 'User_favourites_count',
                                                'User_listed_count', 'User_media_count', 'User_location', 'User_protected', 'User_profile_link', 'User_profile_image_url',
                                                'User_profile_banner_url', 'User_label'  # user-level columns
                                              ])
tweets_df
Datetime Tweet_url Tweet_id Tweet_content Tweet_reply_count Tweet_retweet_count Tweet_like_count Tweet_quote_count Tweet_conversation_id Tweet_language ... User_statuses_count User_favourites_count User_listed_count User_media_count User_location User_protected User_profile_link User_profile_image_url User_profile_banner_url User_label
0 2022-09-01 00:00:00+00:00 https://twitter.com/InformazioneA/status/15651... 1565127293961248773 #Report #31August 0 0 0 0 1565127293961248773 qht ... 18973 58 36 12139 False TextLink(text='informazionea.altervista.org', ... https://pbs.twimg.com/profile_images/123559427... https://pbs.twimg.com/profile_banners/12352445... None

1 rows × 43 columns
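
The overview at the top notes that snscrape can also fetch the thread around a single tweet. A hedged sketch of that, using the mode argument of TwitterTweetScraper (the TwitterTweetScraperMode values are taken from snscrape's Twitter module and may vary between versions):

# Scrape the tweet together with its surrounding thread instead of just the single tweet
scraper = sntwitter.TwitterTweetScraper(
    1565127293961248773,
    mode=sntwitter.TwitterTweetScraperMode.SCROLL,  # SINGLE / SCROLL / RECURSE
)
for tweet in scraper.get_items():
    print(tweet.id, tweet.date)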