|
Twitter
```python
from obsei.source.twitter_source import TwitterCredentials, TwitterSource, TwitterSourceConfig
# Build the Twitter source configuration
source_config = TwitterSourceConfig(
    # Search terms: keywords, @user handles or #hashtags
    keywords=["issue"],
    # Lookup period from current time, format: `<number><d|h|m>` (day|hour|minute)
    lookup_period="1h",
    cred_info=TwitterCredentials(
        # Fill in your Twitter consumer key and secret.
        # Get them from https://developer.twitter.com/en/apply-for-access
        consumer_key="",
        consumer_secret="",
        bearer_token="",
    ),
)

# Create the tweets retriever
source = TwitterSource()
```
|
Youtube Scrapper
```python
from obsei.source.youtube_scrapper import YoutubeScrapperSource, YoutubeScrapperConfig
# Build the Youtube scrapper source configuration
source_config = YoutubeScrapperConfig(
    # Youtube video URL
    video_url="https://www.youtube.com/watch?v=uZfns0JIlFk",
    # Fetch replies to comments as well
    fetch_replies=True,
    # Total number of comments and replies to fetch
    max_comments=10,
    # Lookup period from current time, format: `<number><d|h|m|M|Y>` (day|hour|minute|month|year)
    lookup_period="1Y",
)

# Create the Youtube comments retriever
source = YoutubeScrapperSource()
```
|
Facebook
```python
from obsei.source.facebook_source import FacebookCredentials, FacebookSource, FacebookSourceConfig
# Build the Facebook source configuration
source_config = FacebookSourceConfig(
    # Facebook page id; this example value is the Obsei page
    page_id="110844591144719",
    # Lookup period from current time, format: `<number><d|h|m>` (day|hour|minute)
    lookup_period="1h",
    cred_info=FacebookCredentials(
        # Fill in your Facebook app_id, app_secret and long_term_token.
        # Get them from https://developers.facebook.com/apps/
        app_id="",
        app_secret="",
        long_term_token="",
    ),
)

# Create the Facebook post comments retriever
source = FacebookSource()
```
|
Email
```python
from obsei.source.email_source import EmailConfig, EmailCredInfo, EmailSource
# Build the email source configuration
source_config = EmailConfig(
    # IMAP server to connect to. A list of IMAP servers for the most commonly
    # used email providers is at https://www.systoolsgroup.com/imap/
    #
    # Gmail users: make sure "less secure apps" is allowed on the account -
    #   https://myaccount.google.com/lesssecureapps?pli=1
    # and that IMAP access is enabled -
    #   https://mail.google.com/mail/u/0/#settings/fwdandpop
    imap_server="imap.gmail.com",
    cred_info=EmailCredInfo(
        # Fill in your email account username and password
        username="",
        password="",
    ),
    # Lookup period from current time, format: `<number><d|h|m>` (day|hour|minute)
    lookup_period="1h",
)

# Create the email retriever
source = EmailSource()
```
|
Google Maps Reviews Scrapper
```python
from obsei.source.google_maps_reviews import OSGoogleMapsReviewsSource, OSGoogleMapsReviewsConfig
# Build the Outscraper Maps review source configuration
source_config = OSGoogleMapsReviewsConfig(
    # Collect an API key from https://outscraper.com/
    api_key="",
    # Google Maps links or place ids to query.
    # The entry below is the link for the "Taj Mahal".
    queries=[
        "https://www.google.co.in/maps/place/Taj+Mahal/@27.1751496,78.0399535,17z/data=!4m5!3m4!1s0x39747121d702ff6d:0xdd2ae4803f767dde!8m2!3d27.1751448!4d78.0421422",
    ],
    number_of_reviews=10,
)

# Create the Outscraper Maps review retriever
source = OSGoogleMapsReviewsSource()
```
|
AppStore Reviews Scrapper
```python
from obsei.source.appstore_scrapper import AppStoreScrapperConfig, AppStoreScrapperSource
# initialize app store source config
source_config = AppStoreScrapperConfig(
    # Two parameters are needed: `app_id` and `countries`.
    # `app_id` can be found at the end of the app's url in the app store.
    # For example - https://apps.apple.com/us/app/xcode/id497799835
    # `497799835` is the app_id for xcode and `us` is the country.
    countries=["us"],
    app_id="497799835",
    # Lookup period from current time, format: `<number><d|h|m>` (day|hour|minute)
    lookup_period="1h",
)

# initialize app store reviews retriever
source = AppStoreScrapperSource()
```
|
Play Store Reviews Scrapper
```python
from obsei.source.playstore_scrapper import PlayStoreScrapperConfig, PlayStoreScrapperSource
# Build the play store source configuration
source_config = PlayStoreScrapperConfig(
    # Two parameters are needed: `package_name` and `countries`.
    # `package_name` is the `id` query parameter of the app's play store url.
    # For example - https://play.google.com/store/apps/details?id=com.google.android.gm&hl=en&gl=US
    # `com.google.android.gm` is the package_name (Gmail) and `us` is the country.
    countries=["us"],
    package_name="com.google.android.gm",
    # Lookup period from current time, format: `<number><d|h|m>` (day|hour|minute)
    lookup_period="1h",
)

# Create the play store reviews retriever
source = PlayStoreScrapperSource()
```
|
Reddit
```python
from obsei.source.reddit_source import RedditConfig, RedditSource, RedditCredInfo
# Build the Reddit source configuration
source_config = RedditConfig(
    # List of subreddits to read from
    subreddits=["wallstreetbets"],
    # Credentials: Reddit account username and password.
    # A reddit client_id and client_secret, or a refresh_token, can be used instead.
    # Create credentials at https://www.reddit.com/prefs/apps
    # See also https://praw.readthedocs.io/en/latest/getting_started/authentication.html
    # Supported at present: Password Flow, Read Only Mode and Saved Refresh Token Mode
    cred_info=RedditCredInfo(
        username="",
        password="",
    ),
    # Lookup period from current time, format: `<number><d|h|m>` (day|hour|minute)
    lookup_period="1h",
)

# Create the Reddit retriever
source = RedditSource()
```
|
Reddit Scrapper
Note: Reddit heavily rate-limits scrapers, so use this source to fetch small amounts of data over a long period.
```python
from obsei.source.reddit_scrapper import RedditScrapperConfig, RedditScrapperSource
# Build the Reddit scrapper source configuration
source_config = RedditScrapperConfig(
    # RSS url of a subreddit, search, etc. For how to form a proper url, refer to
    # https://www.reddit.com/r/pathogendavid/comments/tv8m9/pathogendavids_guide_to_rss_and_reddit/
    url="https://www.reddit.com/r/wallstreetbets/comments/.rss?sort=new",
    # Lookup period from current time, format: `<number><d|h|m>` (day|hour|minute)
    lookup_period="1h",
)

# Create the Reddit scrapper retriever
source = RedditScrapperSource()
```
|
Google News
```python
from obsei.source.google_news_source import GoogleNewsConfig, GoogleNewsSource
# Build the Google News source configuration
source_config = GoogleNewsConfig(
    query="bitcoin",
    max_results=5,
    # By default Google News gives only the title and highlight;
    # enable the `fetch_article` flag to fetch the full article text
    fetch_article=True,
    # proxy='http://127.0.0.1:8080'
)

# Create the Google News retriever
source = GoogleNewsSource()
```
|
Web Crawler
```python
from obsei.source.website_crawler_source import TrafilaturaCrawlerConfig, TrafilaturaCrawlerSource
# Build the website crawler source configuration
source_config = TrafilaturaCrawlerConfig(
    urls=["https://obsei.github.io/obsei/"],
)

# Create the website text retriever
source = TrafilaturaCrawlerSource()
```
|
Pandas DataFrame
```python
import pandas as pd
from obsei.source.pandas_source import PandasSource, PandasSourceConfig
# Initialize your Pandas DataFrame from a source such as csv, excel, sql etc.
# The csv read in this example has two columns: `title` and `text`
csv_file = "https://raw.githubusercontent.com/deepset-ai/haystack/master/tutorials/small_generator_dataset.csv"
dataframe = pd.read_csv(csv_file)

# initialize pandas source config
source_config = PandasSourceConfig(
    dataframe=dataframe,
    # Column names must exist in the DataFrame; this csv only has `title` and `text`
    include_columns=["title"],
    text_columns=["text"],
)

# initialize pandas source
source = PandasSource()
```
|