added TwitterScraper file, changed requirements.txt
Files changed:
- requirements.txt +28 -0
- twitter-scraper/TwitterScraper.py +17 -0
- twitter-scraper/twint-master/scrape.py +45 -45
requirements.txt
CHANGED
@@ -1,23 +1,51 @@
+aiodns==3.0.0
+aiohttp==3.8.1
+aiohttp-socks==0.7.1
+aiosignal==1.2.0
+async-timeout==4.0.2
+attrs==21.4.0
+beautifulsoup4==4.11.1
+cchardet==2.1.7
 certifi==2022.6.15
+cffi==1.15.1
 charset-normalizer==2.1.0
 cycler==0.11.0
+dataclasses==0.6
+elastic-transport==8.1.2
+elasticsearch==8.3.1
 et-xmlfile==1.1.0
+fake-useragent==0.1.11
 fonttools==4.34.0
+frozenlist==1.3.0
+geographiclib==1.52
+geopy==2.2.0
+googletransx==2.4.2
 idna==3.3
 kiwisolver==1.4.3
 matplotlib==3.5.2
+multidict==6.0.2
 numpy==1.23.0
+oauthlib==3.2.0
 openai==0.20.0
 openpyxl==3.0.10
 packaging==21.3
 pandas==1.4.3
 pandas-stubs==1.4.3.220704
 Pillow==9.2.0
+pycares==4.2.1
+pycparser==2.21
 pyparsing==3.0.9
+PySocks==1.7.1
 python-dateutil==2.8.2
+python-socks==2.0.3
 pytz==2022.1
 regex==2022.6.2
 requests==2.28.1
+requests-oauthlib==1.3.1
+schedule==1.1.0
 six==1.16.0
+soupsieve==2.3.2.post1
 tqdm==4.64.0
+-e git+https://github.com/twintproject/twint.git@e7c8a0c764f6879188e5c21e25fb6f1f856a7221#egg=twint
 urllib3==1.26.9
+yarl==1.7.2
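Most of these new pins (aiohttp, aiodns, fake-useragent, geopy, googletransx, schedule, yarl, and the rest) are twint's own dependencies; twint itself is installed in editable mode from a pinned GitHub commit rather than from PyPI. One thing to flag: scrape.py below also imports nest_asyncio, which does not appear in this list. A quick post-install sanity check might look like the sketch below (not part of the commit):

import twint

# Confirm the editable install resolved to the pinned GitHub checkout
# rather than a stale PyPI copy; prints the on-disk package location.
print(twint.__file__)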
twitter-scraper/TwitterScraper.py
ADDED
@@ -0,0 +1,17 @@
+import twint
+import datetime
+
+c = twint.Config()
+
+c.Search = ['Taylor Swift']  # topic
+c.Limit = 500  # number of Tweets to scrape
+c.Store_csv = True  # store tweets in a csv file
+c.Output = "taylor_swift_tweets.csv"  # path to csv file
+
+twint.run.Search(c)
+
+import pandas as pd
+
+df = pd.read_csv('taylor_swift_tweets.csv')
+
+print(df.head())
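The new script is a straight-line twint run followed by an unconditional read-back of the CSV. A more defensive variant is sketched below (the guard is an assumption, not part of the commit): twint creates the output file lazily, so a search that returns nothing leaves no CSV and pd.read_csv raises FileNotFoundError.

import os
import pandas as pd

csv_path = "taylor_swift_tweets.csv"  # same output path the script writes

# Guard the read-back: twint only creates the CSV once it has results
# (hypothetical hardening, not in the committed script).
if os.path.exists(csv_path):
    df = pd.read_csv(csv_path)
    print(df.head())
else:
    print("[!] Nothing scraped; no CSV was written.")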
twitter-scraper/twint-master/scrape.py
CHANGED
@@ -5,47 +5,52 @@ import asyncio
 import os
 from tkinter import EXCEPTION
 from numpy import not_equal
+
 loop = asyncio.get_event_loop()
 loop.is_running()
 import twint
 import nest_asyncio
+
 nest_asyncio.apply()
 from datetime import date
+
+
 class scraper:
-    def get_tweets(search_str, from_date="2006-07-01", to_date=str(date.today()), num_tweets=10,u_or_s='s',
+    def get_tweets(search_str, from_date="2006-07-01", to_date=str(date.today()), num_tweets=10, u_or_s='s',
+                   acceptable_range=10):
 
-        if (type(from_date) or
-            print("[!] Please make sure the date is a string in this format \"yyyy-mm-dd\" ")
+        if (type(from_date) or type("str")) is not type("str"):
+            print("[!] Please make sure the date is a string in this format \"yyyy-mm-dd\" ")
             raise EXCEPTION("Incorrect date type Exception!")
-
-        time_out= time.time()+2*60
-        _dict={}
-        c=twint.Config()
-        if u_or_s.lower() =="u":
-            c.Search = "from:@"+search_str
-        else:
-            c.Search = search_str
+
+        time_out = time.time() + 2 * 60
+        _dict = {}
+        c = twint.Config()
+        if u_or_s.lower() == "u":
+            c.Search = "from:@" + search_str  # topic
+        else:
+            c.Search = search_str  # topic
         c.Pandas = True
-        num_tweets_and_replies=num_tweets
-        c.Count=True
-        for j in range(1,5):
+        num_tweets_and_replies = num_tweets
+        c.Count = True
+        for j in range(1, 5):
             c.Limit = num_tweets_and_replies
             c.Since = from_date
-            c.Until =
-            c.Hide_output =True
+            c.Until = to_date
+            c.Hide_output = True
             old_stdout = sys.stdout
             new_stdout = io.StringIO()
             sys.stdout = new_stdout
-            twint.run.Search(c)
+            twint.run.Search(c)
             output = new_stdout.getvalue()
             sys.stdout = old_stdout
             print(output[0:-2])
-            tweet_info=twint.output.panda.Tweets_df
-
-            t_count=0
+            tweet_info = twint.output.panda.Tweets_df
+
+            t_count = 0
             try:
-                _keys=tweet_info["id"]
-                #tweet infor is a dataframe with fallowing columns
+                _keys = tweet_info["id"]
+                # tweet infor is a dataframe with fallowing columns
                 '''Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
                 'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
                 'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
@@ -55,48 +60,43 @@
                 'trans_dest'],
                 dtype='object')'''
 
-                for i in range
+                for i in range(len(_keys)):
                     if _keys[i] in _dict.keys() or tweet_info["tweet"][i].startswith("@"):
                         pass
                     else:
                         _dict[int(_keys[i])] = {"tweet": tweet_info["tweet"][i],
-                                                "date"
+                                                "date": tweet_info["date"][i],
                                                 "nlikes": tweet_info["nlikes"][i],
-                                                "nreplies":tweet_info["nreplies"][i]
-                                                "nretweets": tweet_info["nretweets"][i],"topic":""}
-                    if len(list(_dict.keys()))==num_tweets:
+                                                "nreplies": tweet_info["nreplies"][i],
+                                                "nretweets": tweet_info["nretweets"][i], "topic": ""}
+                    if len(list(_dict.keys())) == num_tweets:
                         break
             except:
                 pass
             print(len(list(_dict.keys())), " of them are Tweets")
-            if (num_tweets-len(list(_dict.keys())))< acceptable_range:
+            if (num_tweets - len(list(_dict.keys()))) < acceptable_range:
                 return _dict
             if len(list(_dict.keys())) < num_tweets:
-                num_tweets_and_replies= num_tweets_and_replies+100*3**j
+                num_tweets_and_replies = num_tweets_and_replies + 100 * 3 ** j
             else:
                 break
-            if time_out <time.time():
+            if time_out < time.time():
                 break
             if output.startswith("[!] No more data!"):
-            break
+                break
         return _dict
 
-    def string_search_user_tweets(user_name,search_str
-
-        c
-        c.
+    def string_search_user_tweets(user_name, search_str, from_date="2006-07-01", to_date=str(date.today()),
+                                  num_tweets=10):
+        c = twint.Config()
+        c.Username = user_name
+        c.Search = search_str  # topic
         c.Pandas = True
-        num_tweets_and_replies=num_tweets
-        c.Count=True
+        num_tweets_and_replies = num_tweets
+        c.Count = True
         c.Limit = num_tweets_and_replies
         c.Since = from_date
-        c.Until =
-        c.Hide_output =True
+        c.Until = to_date
+        c.Hide_output = True
         twint.run.Search(c)
         return twint.output.panda.Tweets_df
-
-
-
-
-
-
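For context, a sketch of how the revised scraper class might be driven (assuming scrape.py's first four lines, outside this hunk, import time, sys, and io; the module name in the import is inferred from the file path, and note that get_tweets is defined without self, so it is called on the class rather than an instance):

from scrape import scraper

# u_or_s='s' searches by topic; 'u' prefixes the query with from:@<user>.
tweets = scraper.get_tweets("Taylor Swift", from_date="2022-01-01",
                            num_tweets=50, u_or_s='s')

# get_tweets returns {tweet_id: {"tweet", "date", "nlikes", "nreplies",
# "nretweets", "topic"}}, skipping replies that start with "@".
for tweet_id, info in tweets.items():
    print(tweet_id, info["nlikes"], info["tweet"][:80])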