diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..76d0075938bab7774619ea0d772ab821a57c9aad 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+obsei_module/images/obsei_flow.gif filter=lfs diff=lfs merge=lfs -text
+obsei_module/obsei-master/images/obsei_flow.gif filter=lfs diff=lfs merge=lfs -text
diff --git a/obsei_module/.github/ISSUE_TEMPLATE/bug_report.md b/obsei_module/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 0000000000000000000000000000000000000000..c6915c4ae905cb402e1dc710b3daafb8f6360df4
--- /dev/null
+++ b/obsei_module/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,27 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: "[BUG]"
+labels: bug
+assignees: lalitpagaria
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**To Reproduce**
+Steps to reproduce the behavior:
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Stacktrace**
+If applicable, add stacktrace to help explain your problem.
+
+**Please complete the following information:**
+ - OS:
+ - Version:
+
+**Additional context**
+Add any other context about the problem here.
diff --git a/obsei_module/.github/ISSUE_TEMPLATE/feature_request.md b/obsei_module/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 0000000000000000000000000000000000000000..11fc491ef1dae316f2b06bbb40eaba9c757fdfd1
--- /dev/null
+++ b/obsei_module/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,20 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: ''
+labels: enhancement
+assignees: ''
+
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.
diff --git a/obsei_module/.github/dependabot.yml b/obsei_module/.github/dependabot.yml
new file mode 100644
index 0000000000000000000000000000000000000000..2c7d1708395e202b3b3316391f35bf4183ebd045
--- /dev/null
+++ b/obsei_module/.github/dependabot.yml
@@ -0,0 +1,7 @@
+version: 2
+updates:
+ # Maintain dependencies for GitHub Actions
+ - package-ecosystem: "github-actions"
+ directory: "/"
+ schedule:
+ interval: "daily"
diff --git a/obsei_module/.github/release-drafter.yml b/obsei_module/.github/release-drafter.yml
new file mode 100644
index 0000000000000000000000000000000000000000..794187190e6f3fb290174970df09c18306b58a39
--- /dev/null
+++ b/obsei_module/.github/release-drafter.yml
@@ -0,0 +1,33 @@
+name-template: 'v$RESOLVED_VERSION 🌈'
+tag-template: 'v$RESOLVED_VERSION'
+categories:
+ - title: '🚀 Features'
+ labels:
+ - 'feature'
+ - 'enhancement'
+ - title: '🐛 Bug Fixes'
+ labels:
+ - 'fix'
+ - 'bugfix'
+ - 'bug'
+ - title: '🧰 Maintenance'
+ label: 'chore'
+ - title: '⚠️Breaking Changes'
+ label: 'breaking changes'
+change-template: '- $TITLE @$AUTHOR (#$NUMBER)'
+change-title-escapes: '\<*_&' # You can add # and @ to disable mentions, and add ` to disable code blocks.
+version-resolver:
+ major:
+ labels:
+ - 'major'
+ minor:
+ labels:
+ - 'minor'
+ patch:
+ labels:
+ - 'patch'
+ default: patch
+template: |
+ ## Changes
+
+ $CHANGES
\ No newline at end of file
diff --git a/obsei_module/.github/workflows/build.yml b/obsei_module/.github/workflows/build.yml
new file mode 100644
index 0000000000000000000000000000000000000000..767b04e369bceb740995187c9c3dfda5e3a90325
--- /dev/null
+++ b/obsei_module/.github/workflows/build.yml
@@ -0,0 +1,54 @@
+# This workflow will install Python dependencies, run test and lint with a single version of Python
+# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: CI
+
+on:
+ push:
+ branches: [ master ]
+ pull_request:
+ branches: [ master ]
+
+jobs:
+ type-check:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
+ with:
+ python-version: '3.10'
+ - name: Test with mypy
+ run: |
+ pip install mypy
+ # Refer http://mypy-lang.blogspot.com/2021/06/mypy-0900-released.html
+ pip install mypy types-requests types-python-dateutil types-PyYAML types-dateparser types-protobuf types-pytz
+ mypy obsei
+
+ build-and-test:
+ needs: type-check
+ runs-on: ${{ matrix.os }}
+ strategy:
+ fail-fast: false
+ matrix:
+ os: [ ubuntu-latest, macos-latest, windows-latest ]
+ python-version: ['3.8', '3.9', '3.10', '3.11']
+
+ steps:
+ - uses: actions/checkout@v4
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v5
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install '.[dev,all]'
+ pip install --upgrade --upgrade-strategy eager trafilatura
+ python -m spacy download en_core_web_lg
+ python -m spacy download en_core_web_sm
+
+ - name: Test with pytest
+ run: |
+ coverage run -m pytest
+ coverage report -m
diff --git a/obsei_module/.github/workflows/pypi_publish.yml b/obsei_module/.github/workflows/pypi_publish.yml
new file mode 100644
index 0000000000000000000000000000000000000000..316334bb75c8e3fff0974b52fa85ffadcbb0b289
--- /dev/null
+++ b/obsei_module/.github/workflows/pypi_publish.yml
@@ -0,0 +1,35 @@
+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+name: Upload Python Package
+
+on:
+ workflow_dispatch:
+ release:
+ types: [published]
+
+jobs:
+ deploy-pypi-artifact:
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.8'
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install setuptools wheel twine hatch
+
+ - name: publish to PyPI
+ if: github.event_name != 'pull_request'
+ env:
+ TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+ TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+ run: |
+ hatch build
+ twine upload dist/*
diff --git a/obsei_module/.github/workflows/release_draft.yml b/obsei_module/.github/workflows/release_draft.yml
new file mode 100644
index 0000000000000000000000000000000000000000..2ed3737754610ea9c71896646975b34355580b4e
--- /dev/null
+++ b/obsei_module/.github/workflows/release_draft.yml
@@ -0,0 +1,15 @@
+name: release draft
+
+on:
+ workflow_dispatch:
+
+jobs:
+ draft-release:
+# if: startsWith(github.ref, 'refs/tags/')
+ runs-on: ubuntu-latest
+ steps:
+ - uses: release-drafter/release-drafter@v6
+ with:
+ config-name: release-drafter.yml
+ env:
+ GITHUB_TOKEN: ${{ secrets.RELEASE_DRAFT_TOKEN }}
\ No newline at end of file
diff --git a/obsei_module/.github/workflows/sdk_docker_publish.yml b/obsei_module/.github/workflows/sdk_docker_publish.yml
new file mode 100644
index 0000000000000000000000000000000000000000..70daa5e698326d30b5d1cee9ba8a5e9213bda1b7
--- /dev/null
+++ b/obsei_module/.github/workflows/sdk_docker_publish.yml
@@ -0,0 +1,50 @@
+# This workflow will build and publish the Obsei SDK Docker image when a release is created
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+name: Publish SDK docker image
+
+on:
+ workflow_dispatch:
+ inputs:
+ tag:
+ description: 'Image tag'
+ required: true
+ release:
+ types: [published]
+
+jobs:
+ deploy-sdk-docker:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Docker meta
+ id: docker_meta
+ uses: docker/metadata-action@v5
+ with:
+ images: obsei/obsei-sdk
+
+ - name: Set up QEMU
+ uses: docker/setup-qemu-action@v3
+
+ - name: Set up Docker Buildx
+ uses: docker/setup-buildx-action@v3
+
+ - name: Login to DockerHub
+ if: github.event_name != 'pull_request'
+ uses: docker/login-action@v3
+ with:
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
+ password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+ - name: Build and push
+ uses: docker/build-push-action@v5
+ with:
+ context: ./
+ file: ./Dockerfile
+ push: ${{ github.event_name != 'pull_request' }}
+ tags: ${{ steps.docker_meta.outputs.tags }}
+ labels: ${{ steps.docker_meta.outputs.labels }}
+
+ - name: Image digest
+ run: echo ${{ steps.docker_build.outputs.digest }}
diff --git a/obsei_module/.github/workflows/ui_docker_publish.yml b/obsei_module/.github/workflows/ui_docker_publish.yml
new file mode 100644
index 0000000000000000000000000000000000000000..2fc690055b8d20c9462412352f2d75f8a6710447
--- /dev/null
+++ b/obsei_module/.github/workflows/ui_docker_publish.yml
@@ -0,0 +1,50 @@
+# This workflow will build and publish the Obsei UI demo Docker image when a release is created
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+name: Publish UI Docker image
+
+on:
+ workflow_dispatch:
+ inputs:
+ tag:
+ description: 'Image tag'
+ required: true
+ release:
+ types: [published]
+
+jobs:
+ deploy-ui-docker:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Docker meta
+ id: docker_meta
+ uses: docker/metadata-action@v5
+ with:
+ images: obsei/obsei-ui-demo
+
+ - name: Set up QEMU
+ uses: docker/setup-qemu-action@v3
+
+ - name: Set up Docker Buildx
+ uses: docker/setup-buildx-action@v3
+
+ - name: Login to DockerHub
+ if: github.event_name != 'pull_request'
+ uses: docker/login-action@v3
+ with:
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
+ password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+ - name: Build and push
+ uses: docker/build-push-action@v5
+ with:
+ context: "{{defaultContext}}:sample-ui"
+ file: Dockerfile
+ push: ${{ github.event_name != 'pull_request' }}
+ tags: ${{ steps.docker_meta.outputs.tags }}
+ labels: ${{ steps.docker_meta.outputs.labels }}
+
+ - name: Image digest
+ run: echo ${{ steps.docker_build.outputs.digest }}
diff --git a/obsei_module/.gitignore b/obsei_module/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..80dd90d9cb4c179a40e922c4a9482c3afe64a999
--- /dev/null
+++ b/obsei_module/.gitignore
@@ -0,0 +1,148 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+/.idea/*
+*.db
+models*
+
+# OSX custom attributes
+.DS_Store
+
+# VS code configuration
+.vscode/*
diff --git a/obsei_module/.pre-commit-config.yaml b/obsei_module/.pre-commit-config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7631ed863745fe51f97e33d1b98b0aeb5ef43b70
--- /dev/null
+++ b/obsei_module/.pre-commit-config.yaml
@@ -0,0 +1,21 @@
+repos:
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v4.3.0
+ hooks:
+ - id: check-yaml
+ - id: trailing-whitespace
+ - id: requirements-txt-fixer
+ - id: end-of-file-fixer
+
+ - repo: https://github.com/psf/black
+ rev: 22.10.0
+ hooks:
+ - id: black
+
+ - repo: https://github.com/pre-commit/mirrors-mypy
+ rev: v0.991
+ hooks:
+ - id: mypy
+ args: [--ignore-missing-imports]
+ additional_dependencies: [types-all]
+ files: ^obsei/
diff --git a/obsei_module/.pyup.yml b/obsei_module/.pyup.yml
new file mode 100644
index 0000000000000000000000000000000000000000..b55ad548d5705a6c15d6f79192892e7612dbc2a3
--- /dev/null
+++ b/obsei_module/.pyup.yml
@@ -0,0 +1,5 @@
+# autogenerated pyup.io config file
+# see https://pyup.io/docs/configuration/ for all available options
+
+schedule: ''
+update: insecure
diff --git a/obsei_module/ATTRIBUTION.md b/obsei_module/ATTRIBUTION.md
new file mode 100644
index 0000000000000000000000000000000000000000..fc6f436d7be74b3ca7d9bbcdcd7d823fb52f7a2e
--- /dev/null
+++ b/obsei_module/ATTRIBUTION.md
@@ -0,0 +1,18 @@
+This could not have been possible without following open source software -
+- [searchtweets-v2](https://github.com/twitterdev/search-tweets-python): For Twitter's API v2 wrapper
+- [vaderSentiment](https://github.com/cjhutto/vaderSentiment): For rule-based sentiment analysis
+- [transformers](https://github.com/huggingface/transformers): For text-classification pipeline
+- [atlassian-python-api](https://github.com/atlassian-api/atlassian-python-api): To interact with Jira
+- [elasticsearch](https://github.com/elastic/elasticsearch-py): To interact with Elasticsearch
+- [pydantic](https://github.com/samuelcolvin/pydantic): For data validation
+- [sqlalchemy](https://github.com/sqlalchemy/sqlalchemy): As SQL toolkit to access DB storage
+- [google-play-scraper](https://github.com/JoMingyu/google-play-scraper): To fetch the Google Play Store review without authentication
+- [praw](https://github.com/praw-dev/praw): For Reddit client
+- [reddit-rss-reader](https://github.com/lalitpagaria/reddit-rss-reader): For Reddit scraping
+- [app-store-reviews-reader](https://github.com/lalitpagaria/app_store_reviews_reader): For App Store reviews scraping
+- [slack-sdk](https://github.com/slackapi/python-slack-sdk): For slack integration
+- [presidio-anonymizer](https://github.com/microsoft/presidio): Personal information anonymizer
+- [GoogleNews](https://github.com/Iceloof/GoogleNews): For Google News integration
+- [python-facebook-api](https://github.com/sns-sdks/python-facebook): For facebook integration
+- [youtube-comment-downloader](https://github.com/egbertbouman/youtube-comment-downloader): For Youtube video comments extraction code
+- [dateparser](https://github.com/scrapinghub/dateparser): To parse date properly (where format is ambiguous)
\ No newline at end of file
diff --git a/obsei_module/CITATION.cff b/obsei_module/CITATION.cff
new file mode 100644
index 0000000000000000000000000000000000000000..bd12a46b8dc05be975f138e2357ebef65de9ade3
--- /dev/null
+++ b/obsei_module/CITATION.cff
@@ -0,0 +1,14 @@
+# YAML 1.2
+---
+authors:
+ -
+ family-names: Pagaria
+ given-names: Lalit
+
+cff-version: "1.1.0"
+license: "Apache-2.0"
+message: "If you use this software, please cite it using this metadata."
+repository-code: "https://github.com/obsei/obsei"
+title: "Obsei - a low code AI powered automation tool"
+version: "0.0.10"
+...
diff --git a/obsei_module/CNAME b/obsei_module/CNAME
new file mode 100644
index 0000000000000000000000000000000000000000..48c4fb7ad825704db946a83e64693071ebe454d7
--- /dev/null
+++ b/obsei_module/CNAME
@@ -0,0 +1 @@
+www.obsei.com
\ No newline at end of file
diff --git a/obsei_module/CODE_OF_CONDUCT.md b/obsei_module/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000000000000000000000000000000000000..e8c5ad02324a0fa0778f625fd77f183f3c531ff7
--- /dev/null
+++ b/obsei_module/CODE_OF_CONDUCT.md
@@ -0,0 +1,128 @@
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, religion, or sexual identity
+and orientation.
+
+We pledge to act and interact in ways that contribute to an open, welcoming,
+diverse, inclusive, and healthy community.
+
+## Our Standards
+
+Examples of behavior that contributes to a positive environment for our
+community include:
+
+- Demonstrating empathy and kindness toward other people
+- Being respectful of differing opinions, viewpoints, and experiences
+- Giving and gracefully accepting constructive feedback
+- Accepting responsibility and apologizing to those affected by our mistakes,
+ and learning from the experience
+- Focusing on what is best not just for us as individuals, but for the
+ overall community
+
+Examples of unacceptable behavior include:
+
+- The use of sexualized language or imagery, and sexual attention or
+ advances of any kind
+- Trolling, insulting or derogatory comments, and personal or political attacks
+- Public or private harassment
+- Publishing others' private information, such as a physical or email
+ address, without their explicit permission
+- Other conduct which could reasonably be considered inappropriate in a
+ professional setting
+
+## Enforcement Responsibilities
+
+Community leaders are responsible for clarifying and enforcing our standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+
+Community leaders have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, and will communicate reasons for moderation
+decisions when appropriate.
+
+## Scope
+
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official e-mail address,
+posting via an official social media account, or acting as an appointed
+representative at an online or offline event.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the community leaders responsible for enforcement at
+obsei.tool@gmail.com
+All complaints will be reviewed and investigated promptly and fairly.
+
+All community leaders are obligated to respect the privacy and security of the
+reporter of any incident.
+
+## Enforcement Guidelines
+
+Community leaders will follow these Community Impact Guidelines in determining
+the consequences for any action they deem in violation of this Code of Conduct:
+
+### 1. Correction
+
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+
+**Consequence**: A written warning from community leaders, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+
+### 2. Warning
+
+**Community Impact**: A violation through a single incident or series
+of actions.
+
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction with
+those enforcing the Code of Conduct, for a specified period of time. This
+includes avoiding interactions in community spaces as well as external channels
+like social media. Violating these terms may lead to a temporary or
+permanent ban.
+
+### 3. Temporary Ban
+
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public or
+private interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, is allowed during this period.
+Violating these terms may lead to a permanent ban.
+
+### 4. Permanent Ban
+
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior, harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+
+**Consequence**: A permanent ban from any sort of public interaction within
+the community.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 2.0, available at
+https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
+
+Community Impact Guidelines were inspired by [Mozilla's code of conduct
+enforcement ladder](https://github.com/mozilla/diversity).
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see the FAQ at
+https://www.contributor-covenant.org/faq. Translations are available at
+https://www.contributor-covenant.org/translations.
diff --git a/obsei_module/CONTRIBUTING.md b/obsei_module/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..f0afbb0a9ac896f8afb37430e389450efe498926
--- /dev/null
+++ b/obsei_module/CONTRIBUTING.md
@@ -0,0 +1,103 @@
+# 👐 Contributing to Obsei
+
+First off, thank you for even considering contributing to this package, every contribution big or small is greatly appreciated.
+Community contributions are what keep projects like this fueled and constantly improving, so a big thanks to you!
+
+Below are some sections detailing the guidelines we'd like you to follow to make your contribution as seamless as possible.
+
+- [Code of Conduct](#coc)
+- [Asking a Question and Discussions](#question)
+- [Issues, Bugs, and Feature Requests](#issue)
+- [Submission Guidelines](#submit)
+- [Code Style and Formatting](#code)
+- [Contributor License Agreement](#cla)
+
+## 📜 Code of Conduct
+
+The [Code of Conduct](https://github.com/obsei/obsei/blob/master/CODE_OF_CONDUCT.md) applies within all community spaces.
+If you are not familiar with our Code of Conduct policy, take a minute to read the policy before starting with your first contribution.
+
+## 🗣️ Query or Discussion
+
+We would like to use [Github discussions](https://github.com/obsei/obsei/discussions) as the central hub for all
+community discussions, questions, and everything else in between. While Github discussions is a new service (as of 2021)
+we believe that it really helps keep this repo as one single source to find all relevant information. Our hope is that
+discussion page functions as a record of all the conversations that help contribute to the project's development.
+
+If you are new to [Github discussions](https://github.com/obsei/obsei/discussions) it is a very similar experience
+to Stack Overflow with an added element of general discussion and discourse rather than solely being question and answer based.
+
+## 🪲 Issues, Bugs, and Feature Requests
+
+We are very open to community contributions and appreciate anything that improves **Obsei**. This includes fixing typos, adding missing documentation, fixing bugs or adding new features.
+To avoid unnecessary work on either side, please stick to the following process:
+
+1. If you feel like your issue is not specific and more of a general question about a design decision, or algorithm implementation maybe start a [discussion](https://github.com/obsei/obsei/discussions) instead, this helps keep the issues less cluttered and encourages more open-ended conversation.
+2. Check if there is already [a related issue](https://github.com/obsei/obsei/issues).
+3. If there is not, open a new one to start a discussion. Some features might be a nice idea, but don't fit in the scope of Obsei and we hate to close finished PRs.
+4. If we came to the conclusion to move forward with your issue, we will be happy to accept a pull request. Make sure you create a pull request in an early draft version and ask for feedback.
+5. Verify that all tests in the CI pass (and add new ones if you implement anything new)
+
+See [below](#submit) for some guidelines.
+
+## ✉️ Submission Guidelines
+
+### Submitting an Issue
+
+Before you submit your issue search the archive, maybe your question was already answered.
+
+If your issue appears to be a bug, and hasn't been reported, open a new issue.
+Help us to maximize the effort we can spend fixing issues and adding new
+features, by not reporting duplicate issues. Providing the following information will increase the
+chances of your issue being dealt with quickly:
+
+- **Describe the bug** - A clear and concise description of what the bug is.
+- **To Reproduce** - Steps to reproduce the behavior.
+- **Expected behavior** - A clear and concise description of what you expected to happen.
+- **Environment**
+ - Obsei version
+ - Python version
+ - OS
+- **Suggest a Fix** - if you can't fix the bug yourself, perhaps you can point to what might be
+ causing the problem (line of code or commit)
+
+When you submit a PR you will be presented with a PR template, please fill this in as best you can.
+
+### Submitting a Pull Request
+
+Before you submit your pull request consider the following guidelines:
+
+- Search [GitHub](https://github.com/obsei/obsei/pulls) for an open or closed Pull Request
+ that relates to your submission. You don't want to duplicate effort.
+- Fork the main repo if not already done
+- Rebase fork with `upstream master`
+- Create new branch and add the changes in that branch
+- Add supporting test cases
+- Follow our [Coding Rules](#rules).
+- Avoid checking in files that shouldn't be tracked (e.g `dist`, `build`, `.tmp`, `.idea`).
+ We recommend using a [global](#global-gitignore) gitignore for this.
+- Before you commit please run the test suite and make sure all tests are passing.
+- Format your code appropriately:
+ - This package uses [black](https://black.readthedocs.io/en/stable/) as its formatter.
+ In order to format your code with black run `black . ` from the root of the package.
+- Run `pre-commit run --all-files` if you're adding new hooks to pre-commit config file. By default, pre-commit will run on modified files when committing changes.
+- Commit your changes using a descriptive commit message.
+- In GitHub, send a pull request to `obsei:master`.
+- If we suggest changes then:
+ - Make the required updates.
+ - Rebase your branch and force push to your GitHub repository (this will update your Pull Request):
+
+That's it! Thank you for your contribution!
+
+## ✅ Coding Rules
+
+We generally follow the [Google Python style guide](http://google.github.io/styleguide/pyguide.html).
+
+## 📝 Contributor License Agreement
+
+To avoid any potential legal problems later, it is sadly necessary to sign a [Contributor License Agreement](CONTRIBUTOR_LICENSE_AGREEMENT.md). That can be done literally with the push of a button.
+
+---
+
+_This guide was inspired by the [transformers-interpret](https://github.com/cdpierse/transformers-interpret/blob/master/CONTRIBUTING.md),
+[Haystack](https://github.com/deepset-ai/haystack/blob/master/CONTRIBUTING.md) and [n8n](https://github.com/n8n-io/n8n/blob/master/CONTRIBUTOR_LICENSE_AGREEMENT.md)_
diff --git a/obsei_module/CONTRIBUTOR_LICENSE_AGREEMENT.md b/obsei_module/CONTRIBUTOR_LICENSE_AGREEMENT.md
new file mode 100644
index 0000000000000000000000000000000000000000..8b4784f57938ed30cbc0de319b9c90df121b3632
--- /dev/null
+++ b/obsei_module/CONTRIBUTOR_LICENSE_AGREEMENT.md
@@ -0,0 +1,3 @@
+# Obsei Contributor License Agreement
+
+I give Obsei's Creator permission to license my contributions to any terms they like. I am giving them this license in order to make it possible for them to accept my contributions into their project.
\ No newline at end of file
diff --git a/obsei_module/Dockerfile b/obsei_module/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..b16cbfd7580a3c384e93b690c80a8e4812d1a57f
--- /dev/null
+++ b/obsei_module/Dockerfile
@@ -0,0 +1,38 @@
+# This is Docker file to Obsei SDK with dependencies installed
+FROM python:3.10-slim-bullseye
+
+RUN useradd --create-home user
+WORKDIR /home/user
+
+# env variable
+ENV PIP_DISABLE_PIP_VERSION_CHECK 1
+ENV PIP_NO_CACHE_DIR 1
+ENV WORKFLOW_SCRIPT '/home/user/obsei/process_workflow.py'
+ENV OBSEI_CONFIG_PATH ""
+ENV OBSEI_CONFIG_FILENAME ""
+
+
+# Hack to install jre on debian
+RUN mkdir -p /usr/share/man/man1
+
+# install few required tools
+RUN apt-get update && apt-get install -y --no-install-recommends curl git pkg-config cmake libncurses5 g++ \
+ && apt-get clean autoclean && apt-get autoremove -y \
+ && rm -rf /var/lib/{apt,dpkg,cache,log}/
+
+# install as a package
+COPY pyproject.toml README.md /home/user/
+RUN pip install --upgrade pip
+
+# copy README
+COPY README.md /home/user/
+
+# copy code
+COPY obsei /home/user/obsei
+RUN pip install -e .[all]
+
+
+USER user
+
+# cmd for running the API
+CMD ["sh", "-c", "python ${WORKFLOW_SCRIPT}"]
diff --git a/obsei_module/LICENSE b/obsei_module/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..fecb6d71f505d183b3e4f5bbda806637c660d0f1
--- /dev/null
+++ b/obsei_module/LICENSE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2020-2022 Oraika Technologies Private Limited (https://www.oraika.com)
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/obsei_module/MANIFEST.in b/obsei_module/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..84c71247ce333d3b19e1265f4da3fd130972bc35
--- /dev/null
+++ b/obsei_module/MANIFEST.in
@@ -0,0 +1,3 @@
+include LICENSE
+include requirements.txt
+include README.md
diff --git a/obsei_module/README.md b/obsei_module/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..48c602f538183d2bba5f019e2c81cb32946cb71b
--- /dev/null
+++ b/obsei_module/README.md
@@ -0,0 +1,1067 @@
+
+
+---
+
+![](https://raw.githubusercontent.com/obsei/obsei-resources/master/gifs/obsei_flow.gif)
+
+---
+
+
+Note: Obsei is still in the alpha stage, so use it carefully in production. Also, as it is constantly under development, the master branch may contain many breaking changes. Please use a released version.
+
+
+---
+
+**Obsei** (pronounced "Ob see" | /əb-'sē/) is an open-source, low-code, AI powered automation tool. _Obsei_ consists of -
+
+- **Observer**: Collect unstructured data from various sources like tweets from Twitter, Subreddit comments on Reddit, page post's comments from Facebook, App Stores reviews, Google reviews, Amazon reviews, News, Website, etc.
+- **Analyzer**: Analyze unstructured data collected with various AI tasks like classification, sentiment analysis, translation, PII, etc.
+- **Informer**: Send analyzed data to various destinations like ticketing platforms, data storage, dataframe, etc so that the user can take further actions and perform analysis on the data.
+
+All the Observers can store their state in databases (Sqlite, Postgres, MySQL, etc.), making Obsei suitable for scheduled jobs or serverless applications.
+
+![Obsei diagram](https://raw.githubusercontent.com/obsei/obsei-resources/master/images/Obsei_diagram.png)
+
+### Future direction -
+
+- Text, Image, Audio, Documents and Video oriented workflows
+- Collect data from every possible private and public channels
+- Add every possible workflow to an AI downstream application to automate manual cognitive workflows
+
+## Use cases
+
+_Obsei_ use cases include the following, but are not limited to -
+
+- Social listening: Listening about social media posts, comments, customer feedback, etc.
+- Alerting/Notification: To get auto-alerts for events such as customer complaints, qualified sales leads, etc.
+- Automatic customer issue creation based on customer complaints on Social Media, Email, etc.
+- Automatic assignment of proper tags to tickets based on the content of the customer complaint, for example login issue, sign-up issue, delivery issue, etc.
+- Extraction of deeper insights from feedback on various platforms
+- Market research
+- Creation of dataset for various AI tasks
+- Many more based on creativity 💡
+
+## Installation
+
+### Prerequisite
+
+Install the following (if not present already) -
+
+- Install [Python 3.7+](https://www.python.org/downloads/)
+- Install [PIP](https://pip.pypa.io/en/stable/installing/)
+
+### Install Obsei
+
+You can install Obsei either via PIP or Conda based on your preference.
+To install latest released version -
+
+```shell
+pip install obsei[all]
+```
+
+Install from master branch (if you want to try the latest features) -
+
+```shell
+git clone https://github.com/obsei/obsei.git
+cd obsei
+pip install --editable .[all]
+```
+
+Note: the `all` option will install all the dependencies, which might not be needed for your workflow. Alternatively,
+the following options are available to install minimal dependencies as per need -
+ - `pip install obsei[source]`: To install dependencies related to all observers
+ - `pip install obsei[sink]`: To install dependencies related to all informers
+ - `pip install obsei[analyzer]`: To install dependencies related to all analyzers, it will install pytorch as well
+ - `pip install obsei[twitter-api]`: To install dependencies related to Twitter observer
+ - `pip install obsei[google-play-scraper]`: To install dependencies related to Play Store review scrapper observer
+ - `pip install obsei[google-play-api]`: To install dependencies related to Google official play store review API based observer
+ - `pip install obsei[app-store-scraper]`: To install dependencies related to Apple App Store review scrapper observer
+ - `pip install obsei[reddit-scraper]`: To install dependencies related to Reddit post and comment scrapper observer
+ - `pip install obsei[reddit-api]`: To install dependencies related to Reddit official api based observer
+ - `pip install obsei[pandas]`: To install dependencies related to TSV/CSV/Pandas based observer and informer
+ - `pip install obsei[google-news-scraper]`: To install dependencies related to Google news scrapper observer
+ - `pip install obsei[facebook-api]`: To install dependencies related to Facebook official page post and comments api based observer
+ - `pip install obsei[atlassian-api]`: To install dependencies related to Jira official api based informer
+ - `pip install obsei[elasticsearch]`: To install dependencies related to elasticsearch informer
+ - `pip install obsei[slack-api]`: To install dependencies related to Slack official api based informer
+
+You can also mix multiple dependencies together in a single installation command. For example, to install dependencies
+for the Twitter observer, all analyzers, and the Slack informer, use the following command -
+```shell
+pip install obsei[twitter-api, analyzer, slack-api]
+```
+
+
+## How to use
+
+Expand the following steps and create a workflow -
+
+Step 1: Configure Source/Observer
+
+
+
Twitter
+
+```python
+from obsei.source.twitter_source import TwitterCredentials, TwitterSource, TwitterSourceConfig
+
+# initialize twitter source config
+source_config = TwitterSourceConfig(
+ keywords=["issue"], # Keywords, @user or #hashtags
+ lookup_period="1h", # Lookup period from current time, format: `<number><d|h|m>` (day|hour|minute)
+ cred_info=TwitterCredentials(
+ # Enter your twitter consumer key and secret. Get it from https://developer.twitter.com/en/apply-for-access
+ consumer_key="",
+ consumer_secret="",
+ bearer_token='',
+ )
+)
+
+# initialize tweets retriever
+source = TwitterSource()
+```
+
+
+
+
+
+
Youtube Scrapper
+
+```python
+from obsei.source.youtube_scrapper import YoutubeScrapperSource, YoutubeScrapperConfig
+
+# initialize Youtube source config
+source_config = YoutubeScrapperConfig(
+ video_url="https://www.youtube.com/watch?v=uZfns0JIlFk", # Youtube video URL
+ fetch_replies=True, # Fetch replies to comments
+ max_comments=10, # Total number of comments and replies to fetch
+ lookup_period="1Y", # Lookup period from current time, format: `<number><d|h|m|M|Y>` (day|hour|minute|month|year)
+)
+
+# initialize Youtube comments retriever
+source = YoutubeScrapperSource()
+```
+
+
+
+
+
+
Facebook
+
+```python
+from obsei.source.facebook_source import FacebookCredentials, FacebookSource, FacebookSourceConfig
+
+# initialize facebook source config
+source_config = FacebookSourceConfig(
+ page_id="110844591144719", # Facebook page id, for example this one for Obsei
+ lookup_period="1h", # Lookup period from current time, format: `<number><d|h|m>` (day|hour|minute)
+ cred_info=FacebookCredentials(
+ # Enter your facebook app_id, app_secret and long_term_token. Get it from https://developers.facebook.com/apps/
+ app_id="",
+ app_secret="",
+ long_term_token="",
+ )
+)
+
+# initialize facebook post comments retriever
+source = FacebookSource()
+```
+
+
+
+
+
+
Email
+
+```python
+from obsei.source.email_source import EmailConfig, EmailCredInfo, EmailSource
+
+# initialize email source config
+source_config = EmailConfig(
+ # List of IMAP servers for most commonly used email providers
+ # https://www.systoolsgroup.com/imap/
+ # Also, if you're using a Gmail account then make sure you allow less secure apps on your account -
+ # https://myaccount.google.com/lesssecureapps?pli=1
+ # Also enable IMAP access -
+ # https://mail.google.com/mail/u/0/#settings/fwdandpop
+ imap_server="imap.gmail.com", # Enter IMAP server
+ cred_info=EmailCredInfo(
+ # Enter your email account username and password
+ username="",
+ password=""
+ ),
+ lookup_period="1h" # Lookup period from current time, format: `<number><d|h|m>` (day|hour|minute)
+)
+
+# initialize email retriever
+source = EmailSource()
+```
+
+
+
+
+
+
Google Maps Reviews Scrapper
+
+```python
+from obsei.source.google_maps_reviews import OSGoogleMapsReviewsSource, OSGoogleMapsReviewsConfig
+
+# initialize Outscrapper Maps review source config
+source_config = OSGoogleMapsReviewsConfig(
+ # Collect API key from https://outscraper.com/
+ api_key="",
+ # Enter Google Maps link or place id
+ # For example below is for the "Taj Mahal"
+ queries=["https://www.google.co.in/maps/place/Taj+Mahal/@27.1751496,78.0399535,17z/data=!4m5!3m4!1s0x39747121d702ff6d:0xdd2ae4803f767dde!8m2!3d27.1751448!4d78.0421422"],
+ number_of_reviews=10,
+)
+
+
+# initialize Outscrapper Maps review retriever
+source = OSGoogleMapsReviewsSource()
+```
+
+
+
+
+
+
AppStore Reviews Scrapper
+
+```python
+from obsei.source.appstore_scrapper import AppStoreScrapperConfig, AppStoreScrapperSource
+
+# initialize app store source config
+source_config = AppStoreScrapperConfig(
+ # Need two parameters app_id and country.
+ # `app_id` can be found at the end of the url of app in app store.
+ # For example - https://apps.apple.com/us/app/xcode/id497799835
+ # `497799835` is the app_id for xcode and `us` is the country.
+ countries=["us"],
+ app_id="497799835",
+ lookup_period="1h" # Lookup period from current time, format: `<number><d|h|m>` (day|hour|minute)
+)
+
+
+# initialize app store reviews retriever
+source = AppStoreScrapperSource()
+```
+
+
+
+
+
+
Play Store Reviews Scrapper
+
+```python
+from obsei.source.playstore_scrapper import PlayStoreScrapperConfig, PlayStoreScrapperSource
+
+# initialize play store source config
+source_config = PlayStoreScrapperConfig(
+ # Need two parameters package_name and country.
+ # `package_name` can be found at the end of the url of app in play store.
+ # For example - https://play.google.com/store/apps/details?id=com.google.android.gm&hl=en&gl=US
+ # `com.google.android.gm` is the package_name for Gmail and `us` is the country.
+ countries=["us"],
+ package_name="com.google.android.gm",
+ lookup_period="1h" # Lookup period from current time, format: `<number><d|h|m>` (day|hour|minute)
+)
+
+# initialize play store reviews retriever
+source = PlayStoreScrapperSource()
+```
+
+
+
+
+
+
Reddit
+
+```python
+from obsei.source.reddit_source import RedditConfig, RedditSource, RedditCredInfo
+
+# initialize reddit source config
+source_config = RedditConfig(
+ subreddits=["wallstreetbets"], # List of subreddits
+ # Reddit account username and password
+ # You can also enter reddit client_id and client_secret or refresh_token
+ # Create credential at https://www.reddit.com/prefs/apps
+ # Also refer https://praw.readthedocs.io/en/latest/getting_started/authentication.html
+ # Currently Password Flow, Read Only Mode and Saved Refresh Token Mode are supported
+ cred_info=RedditCredInfo(
+ username="",
+ password=""
+ ),
+ lookup_period="1h" # Lookup period from current time, format: `<number><d|h|m>` (day|hour|minute)
+)
+
+# initialize reddit retriever
+source = RedditSource()
+```
+
+
+
+
+
+
Reddit Scrapper
+
+Note: Reddit heavily rate-limits scrapers, hence use it to fetch a small amount of data over a long period
+
+```python
+from obsei.source.reddit_scrapper import RedditScrapperConfig, RedditScrapperSource
+
+# initialize reddit scrapper source config
+source_config = RedditScrapperConfig(
+ # Reddit subreddit, search etc rss url. For proper url refer following link -
+ # Refer https://www.reddit.com/r/pathogendavid/comments/tv8m9/pathogendavids_guide_to_rss_and_reddit/
+ url="https://www.reddit.com/r/wallstreetbets/comments/.rss?sort=new",
+ lookup_period="1h" # Lookup period from current time, format: `<number><d|h|m>` (day|hour|minute)
+)
+
+# initialize reddit retriever
+source = RedditScrapperSource()
+```
+
+
+
+
+
+
Google News
+
+```python
+from obsei.source.google_news_source import GoogleNewsConfig, GoogleNewsSource
+
+# initialize Google News source config
+source_config = GoogleNewsConfig(
+ query='bitcoin',
+ max_results=5,
+ # To fetch full article text enable `fetch_article` flag
+ # By default google news gives title and highlight
+ fetch_article=True,
+ # proxy='http://127.0.0.1:8080'
+)
+
+# initialize Google News retriever
+source = GoogleNewsSource()
+```
+
+
+
Pandas DataFrame
+
+```python
+import pandas as pd
+from obsei.source.pandas_source import PandasSource, PandasSourceConfig
+
+# Initialize your Pandas DataFrame from your sources like csv, excel, sql etc
+# In following example we are reading csv which have two columns title and text
+csv_file = "https://raw.githubusercontent.com/deepset-ai/haystack/master/tutorials/small_generator_dataset.csv"
+dataframe = pd.read_csv(csv_file)
+
+# initialize pandas sink config
+sink_config = PandasSourceConfig(
+ dataframe=dataframe,
+ include_columns=["score"],
+ text_columns=["name", "degree"],
+)
+
+# initialize pandas sink
+sink = PandasSource()
+```
+
+
+
+
+
+
+
+
+
+Step 2: Configure Analyzer
+
+Note: To run transformers in an offline mode, check [transformers offline mode](https://huggingface.co/transformers/installation.html#offline-mode).
+
+
Some analyzer support GPU and to utilize pass device parameter.
+List of possible values of device parameter (default value auto):
+
+
auto: GPU (cuda:0) will be used if available otherwise CPU will be used
+
cpu: CPU will be used
+
cuda:{id} - GPU will be used with provided CUDA device id
+
+
+
+
+
Text Classification
+
+Text classification: Classify text into user provided categories.
+
+```python
+from obsei.analyzer.classification_analyzer import ClassificationAnalyzerConfig, ZeroShotClassificationAnalyzer
+
+# initialize classification analyzer config
+# It can also detect sentiments if "positive" and "negative" labels are added.
+analyzer_config=ClassificationAnalyzerConfig(
+ labels=["service", "delay", "performance"],
+)
+
+# initialize classification analyzer
+# For supported models refer https://huggingface.co/models?filter=zero-shot-classification
+text_analyzer = ZeroShotClassificationAnalyzer(
+ model_name_or_path="typeform/mobilebert-uncased-mnli",
+ device="auto"
+)
+```
+
+
+
+
+
+
Sentiment Analyzer
+
+Sentiment Analyzer: Detect the sentiment of the text. Text classification can also perform sentiment analysis, but if you don't want to use a heavy-duty NLP model, then use the less resource-hungry, dictionary-based Vader sentiment detector.
+
+```python
+from obsei.analyzer.sentiment_analyzer import VaderSentimentAnalyzer
+
+# Vader does not need any configuration settings
+analyzer_config=None
+
+# initialize vader sentiment analyzer
+text_analyzer = VaderSentimentAnalyzer()
+```
+
+
+
+
+
+
NER Analyzer
+
+NER (Named-Entity Recognition) Analyzer: Extract information and classify named entities mentioned in text into pre-defined categories such as person names, organizations, locations, medical codes, time expressions, quantities, monetary values, percentages, etc
+
+```python
+from obsei.analyzer.ner_analyzer import NERAnalyzer
+
+# NER analyzer does not need configuration settings
+analyzer_config=None
+
+# initialize ner analyzer
+# For supported models refer https://huggingface.co/models?filter=token-classification
+text_analyzer = NERAnalyzer(
+ model_name_or_path="elastic/distilbert-base-cased-finetuned-conll03-english",
+ device = "auto"
+)
+```
+
+
+
PII Anonymizer
+
+```python
+from obsei.analyzer.pii_analyzer import PresidioEngineConfig, PresidioModelConfig, \
+ PresidioPIIAnalyzer, PresidioPIIAnalyzerConfig
+
+# initialize pii analyzer's config
+analyzer_config = PresidioPIIAnalyzerConfig(
+ # Whether to return only pii analysis or anonymize text
+ analyze_only=False,
+ # Whether to return detail information about anonymization decision
+ return_decision_process=True
+)
+
+# initialize pii analyzer
+analyzer = PresidioPIIAnalyzer(
+ engine_config=PresidioEngineConfig(
+ # spacy and stanza nlp engines are supported
+ # For more info refer
+ # https://microsoft.github.io/presidio/analyzer/developing_recognizers/#utilize-spacy-or-stanza
+ nlp_engine_name="spacy",
+ # Update desired spacy model and language
+ models=[PresidioModelConfig(model_name="en_core_web_lg", lang_code="en")]
+ )
+)
+```
+
+
+
+
+
+
Dummy Analyzer
+
+Dummy Analyzer: Does nothing. It is simply used for transforming the input (TextPayload) to the output (TextPayload) and adding the user-supplied dummy data.
+
+```python
+from obsei.analyzer.dummy_analyzer import DummyAnalyzer, DummyAnalyzerConfig
+
+# initialize dummy analyzer's configuration settings
+analyzer_config = DummyAnalyzerConfig()
+
+# initialize dummy analyzer
+analyzer = DummyAnalyzer()
+```
+
+
+
+
+
+
+
+
+
+Step 3: Configure Sink/Informer
+
+
+
Slack
+
+```python
+from obsei.sink.slack_sink import SlackSink, SlackSinkConfig
+
+# initialize slack sink config
+sink_config = SlackSinkConfig(
+ # Provide slack bot/app token
+ # For more detail refer https://slack.com/intl/en-de/help/articles/215770388-Create-and-regenerate-API-tokens
+ slack_token="",
+ # To get channel id refer https://stackoverflow.com/questions/40940327/what-is-the-simplest-way-to-find-a-slack-team-id-and-a-channel-id
+ channel_id="C01LRS6CT9Q"
+)
+
+# initialize slack sink
+sink = SlackSink()
+```
+
+
+
+
+
+
Zendesk
+
+```python
+from obsei.sink.zendesk_sink import ZendeskSink, ZendeskSinkConfig, ZendeskCredInfo
+
+# initialize zendesk sink config
+sink_config = ZendeskSinkConfig(
+ # provide zendesk domain
+ domain="zendesk.com",
+ # provide subdomain if you have one
+ subdomain=None,
+ # Enter zendesk user details
+ cred_info=ZendeskCredInfo(
+ email="",
+ password=""
+ )
+)
+
+# initialize zendesk sink
+sink = ZendeskSink()
+```
+
+
+
+
+
+
Jira
+
+```python
+from obsei.sink.jira_sink import JiraSink, JiraSinkConfig
+
+# For testing purpose you can start jira server locally
+# Refer https://developer.atlassian.com/server/framework/atlassian-sdk/atlas-run-standalone/
+
+# initialize Jira sink config
+sink_config = JiraSinkConfig(
+ url="http://localhost:2990/jira", # Jira server url
+ # Jira username & password for user who have permission to create issue
+ username="",
+ password="",
+ # Which type of issue to be created
+ # For more information refer https://support.atlassian.com/jira-cloud-administration/docs/what-are-issue-types/
+ issue_type={"name": "Task"},
+ # Under which project issue to be created
+ # For more information refer https://support.atlassian.com/jira-software-cloud/docs/what-is-a-jira-software-project/
+ project={"key": "CUS"},
+)
+
+# initialize Jira sink
+sink = JiraSink()
+```
+
+
+
+
+
+
ElasticSearch
+
+```python
+from obsei.sink.elasticsearch_sink import ElasticSearchSink, ElasticSearchSinkConfig
+
+# For testing purpose you can start Elasticsearch server locally via docker
+# `docker run -d --name elasticsearch -p 9200:9200 -e "discovery.type=single-node" elasticsearch:8.5.0`
+
+# initialize Elasticsearch sink config
+sink_config = ElasticSearchSinkConfig(
+ # Elasticsearch server
+ hosts="http://localhost:9200",
+ # Index name, it will create if not exist
+ index_name="test",
+)
+
+# initialize Elasticsearch sink
+sink = ElasticSearchSink()
+```
+
+
+
+
+
+
Http
+
+```python
+from obsei.sink.http_sink import HttpSink, HttpSinkConfig
+
+# For testing purpose you can create mock http server via postman
+# For more details refer https://learning.postman.com/docs/designing-and-developing-your-api/mocking-data/setting-up-mock/
+
+# initialize http sink config (Currently only POST call is supported)
+sink_config = HttpSinkConfig(
+ # provide http server url
+ url="https://localhost:8080/api/path",
+ # Here you can add headers you would like to pass with request
+ headers={
+ "Content-type": "application/json"
+ }
+)
+
+# To modify or converting the payload, create convertor class
+# Refer obsei.sink.dailyget_sink.PayloadConvertor for example
+
+# initialize http sink
+sink = HttpSink()
+```
+
+
+
+
+
+
+Step 4: Join and create workflow
+
+`source` will fetch data from the selected source, then feed it to the `analyzer` for processing, whose output we feed into a `sink` to get notified at that sink.
+
+```python
+# Uncomment if you want logger
+# import logging
+# import sys
+# logger = logging.getLogger(__name__)
+# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+# This will fetch information from configured source ie twitter, app store etc
+source_response_list = source.lookup(source_config)
+
+# Uncomment if you want to log source response
+# for idx, source_response in enumerate(source_response_list):
+# logger.info(f"source_response#'{idx}'='{source_response.__dict__}'")
+
+# This will execute analyzer (Sentiment, classification etc) on source data with provided analyzer_config
+analyzer_response_list = text_analyzer.analyze_input(
+ source_response_list=source_response_list,
+ analyzer_config=analyzer_config
+)
+
+# Uncomment if you want to log analyzer response
+# for idx, an_response in enumerate(analyzer_response_list):
+# logger.info(f"analyzer_response#'{idx}'='{an_response.__dict__}'")
+
+# Analyzer output added to segmented_data
+# Uncomment to log it
+# for idx, an_response in enumerate(analyzer_response_list):
+# logger.info(f"analyzed_data#'{idx}'='{an_response.segmented_data.__dict__}'")
+
+# This will send analyzed output to configure sink ie Slack, Zendesk etc
+sink_response_list = sink.send_data(analyzer_response_list, sink_config)
+
+# Uncomment if you want to log sink response
+# for sink_response in sink_response_list:
+# if sink_response is not None:
+# logger.info(f"sink_response='{sink_response}'")
+```
+
+
+
+Step 5: Execute workflow
+Copy the code snippets from Steps 1 to 4 into a python file, for example example.py and execute the following command -
+
+```shell
+python example.py
+```
+
+
+
+## Demo
+
+We have a minimal [streamlit](https://streamlit.io/) based UI that you can use to test Obsei.
+
+![Screenshot](https://raw.githubusercontent.com/obsei/obsei-resources/master/images/obsei-ui-demo.png)
+
+### Watch UI demo video
+
+[![Introductory and demo video](https://img.youtube.com/vi/GTF-Hy96gvY/2.jpg)](https://www.youtube.com/watch?v=GTF-Hy96gvY)
+
+Check demo at [![](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/obsei/obsei-demo)
+
+(**Note**: Sometimes the Streamlit demo might not work due to rate limiting, use the docker image (locally) in such cases.)
+
+To test locally, just run
+
+```
+docker run -d --name obsei-ui -p 8501:8501 obsei/obsei-ui-demo
+
+# You can find the UI at http://localhost:8501
+```
+
+**To run Obsei workflow easily using GitHub Actions (no sign ups and cloud hosting required), refer to this [repo](https://github.com/obsei/demo-workflow-action)**.
+
+## Companies/Projects using Obsei
+
+Here are some companies/projects (alphabetical order) using Obsei. To add your company/project to the list, please raise a PR or contact us via [email](mailto:contact@obsei.com).
+
+- [Oraika](https://www.oraika.com): Contextually understand customer feedback
+- [1Page](https://www.get1page.com/): Giving a better context in meetings and calls
+- [Spacepulse](http://spacepulse.in/): The operating system for spaces
+- [Superblog](https://superblog.ai/): A blazing fast alternative to WordPress and Medium
+- [Zolve](https://zolve.com/): Creating a financial world beyond borders
+- [Utilize](https://www.utilize.app/): No-code app builder for businesses with a deskless workforce
+
+## Articles
+
+
Observe app reviews from Google play store, PreProcess text via various text cleaning functions, Analyze them by performing text classification, Inform them to Pandas DataFrame and store resultant CSV to Google Drive
+
+
+
PlayStore Reviews → PreProcessing → Classification → Pandas DataFrame → CSV in Google Drive
Observe app reviews from Apple app store, PreProcess text via various text cleaning function, Analyze them by performing text classification, Inform them to Pandas DataFrame and store resultant CSV to Google Drive
+
+
+
AppStore Reviews → PreProcessing → Classification → Pandas DataFrame → CSV in Google Drive
Observe news article from Google news, PreProcess text via various text cleaning function, Analyze them via performing text classification while splitting text in small chunks and later computing final inference using given formula
+
+
+
Google News → Text Cleaner → Text Splitter → Classification → Inference Aggregator
+
+💡Tips: Handle large text classification via Obsei
+
+![](https://raw.githubusercontent.com/obsei/obsei-resources/master/gifs/Long_Text_Classification.gif)
+
+
+
+## Documentation
+
+For detailed installation instructions, usages and examples, refer to our [documentation](https://obsei.github.io/obsei/).
+
+## Support and Release Matrix
+
+
+
+
+
+
Linux
+
Mac
+
Windows
+
Remark
+
+
+
+
+
Tests
+
✅
+
✅
+
✅
+
Low Coverage as difficult to test 3rd party libs
+
+
+
PIP
+
✅
+
✅
+
✅
+
Fully Supported
+
+
+
Conda
+
❌
+
❌
+
❌
+
Not Supported
+
+
+
+
+## Discussion forum
+
+Discussion about _Obsei_ can be done at [community forum](https://github.com/obsei/obsei/discussions)
+
+## Changelogs
+
+Refer [releases](https://github.com/obsei/obsei/releases) for changelogs
+
+## Security Issue
+
+For any security issue please contact us via [email](mailto:contact@oraika.com)
+
+## Stargazers over time
+
+[![Stargazers over time](https://starchart.cc/obsei/obsei.svg)](https://starchart.cc/obsei/obsei)
+
+## Maintainers
+
+This project is being maintained by [Oraika Technologies](https://www.oraika.com). [Lalit Pagaria](https://github.com/lalitpagaria) and [Girish Patel](https://github.com/GirishPatel) are maintainers of this project.
+
+## License
+
+- Copyright holder: [Oraika Technologies](https://www.oraika.com)
+- Overall Apache 2.0 and you can read [License](https://github.com/obsei/obsei/blob/master/LICENSE) file.
+- Multiple other secondary permissive or weak copyleft licenses (LGPL, MIT, BSD etc.) for third-party components refer [Attribution](https://github.com/obsei/obsei/blob/master/ATTRIBUTION.md).
+- To keep the project commercially friendly, we avoid including third-party components that have strong copyleft licenses (GPL, AGPL etc.) in the project.
+
+## Attribution
+
+This could not have been possible without these [open source softwares](https://github.com/obsei/obsei/blob/master/ATTRIBUTION.md).
+
+## Contribution
+
+First off, thank you for even considering contributing to this package, every contribution big or small is greatly appreciated.
+Please refer our [Contribution Guideline](https://github.com/obsei/obsei/blob/master/CONTRIBUTING.md) and [Code of Conduct](https://github.com/obsei/obsei/blob/master/CODE_OF_CONDUCT.md).
+
+Thanks so much to all our contributors
+
+
+
+
diff --git a/obsei_module/SECURITY.md b/obsei_module/SECURITY.md
new file mode 100644
index 0000000000000000000000000000000000000000..40ce33e3996ab24222f9c236fe167128c507ed6e
--- /dev/null
+++ b/obsei_module/SECURITY.md
@@ -0,0 +1,5 @@
+# Security Policy
+
+## Reporting a Vulnerability
+
+For any security issue please report it via [email](mailto:contact@oraika.com).
diff --git a/obsei_module/__init__.py b/obsei_module/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/obsei_module/__pycache__/__init__.cpython-311.pyc b/obsei_module/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1356d2f750a409d6caf759f2ea6baed2abfed9cf
Binary files /dev/null and b/obsei_module/__pycache__/__init__.cpython-311.pyc differ
diff --git a/obsei_module/_config.yml b/obsei_module/_config.yml
new file mode 100644
index 0000000000000000000000000000000000000000..0b55420d431480b1c3f2d4515c45b47c2e0625df
--- /dev/null
+++ b/obsei_module/_config.yml
@@ -0,0 +1,9 @@
+theme: jekyll-theme-primer
+markdown: CommonMarkGhPages
+commonmark:
+ options: ["UNSAFE", "SMART", "FOOTNOTES"]
+ extensions: ["strikethrough", "autolink", "table", "tagfilter"]
+title: "Obsei: An open-source low-code AI powered automation tool"
+description: "Obsei is an open-source low-code AI powered automation tool"
+
+google_analytics: G-0E2FTKBK4T
diff --git a/obsei_module/_includes/head-custom-google-analytics.html b/obsei_module/_includes/head-custom-google-analytics.html
new file mode 100644
index 0000000000000000000000000000000000000000..360ca261d4caea0b2597b4d53b2e95605b341b86
--- /dev/null
+++ b/obsei_module/_includes/head-custom-google-analytics.html
@@ -0,0 +1,9 @@
+
+
+
diff --git a/obsei_module/binder/requirements.txt b/obsei_module/binder/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c436e37c0702f46f8deb36b9deed2d3fb8491296
--- /dev/null
+++ b/obsei_module/binder/requirements.txt
@@ -0,0 +1,2 @@
+git+https://github.com/obsei/obsei@master#egg=obsei[all]
+trafilatura
diff --git a/obsei_module/example/app_store_scrapper_example.py b/obsei_module/example/app_store_scrapper_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcbf9bb1a4a24527319dda4b130a41f7cb12f549
--- /dev/null
+++ b/obsei_module/example/app_store_scrapper_example.py
@@ -0,0 +1,41 @@
"""Example: scrape recent Apple App Store reviews and zero-shot classify them."""
import logging
import sys
from datetime import datetime, timedelta

import pytz

from obsei.analyzer.classification_analyzer import ClassificationAnalyzerConfig, ZeroShotClassificationAnalyzer
from obsei.misc.utils import DATETIME_STRING_PATTERN
from obsei.source.appstore_scrapper import (
    AppStoreScrapperConfig,
    AppStoreScrapperSource,
)

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Fetch reviews posted during the last 5 days.
# BUG FIX: datetime.utcnow() returns a *naive* datetime and astimezone()
# interprets a naive datetime as local time, which skews the lookup window on
# non-UTC machines. datetime.now(tz=...) yields a correct aware UTC timestamp.
since_time = datetime.now(tz=pytz.utc) - timedelta(days=5)
source_config = AppStoreScrapperConfig(
    # Store URL of the app whose reviews should be scraped
    app_url='https://apps.apple.com/us/app/gmail-email-by-google/id422689480',
    lookup_period=since_time.strftime(DATETIME_STRING_PATTERN),
    max_count=10,
)

source = AppStoreScrapperSource()

# Small MNLI model; device="auto" picks GPU when available.
text_analyzer = ZeroShotClassificationAnalyzer(
    model_name_or_path="typeform/mobilebert-uncased-mnli", device="auto"
)

source_response_list = source.lookup(source_config)
for idx, source_response in enumerate(source_response_list):
    logger.info(f"source_response#'{idx}'='{source_response.__dict__}'")

# Score each review against the candidate labels.
analyzer_response_list = text_analyzer.analyze_input(
    source_response_list=source_response_list,
    analyzer_config=ClassificationAnalyzerConfig(
        labels=["interface", "slow", "battery"],
    ),
)
for idx, an_response in enumerate(analyzer_response_list):
    logger.info(f"analyzer_response#'{idx}'='{an_response.__dict__}'")
diff --git a/obsei_module/example/daily_get_example.py b/obsei_module/example/daily_get_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b8209b21f1113035aa7f45a3b419e77fbac73e2
--- /dev/null
+++ b/obsei_module/example/daily_get_example.py
@@ -0,0 +1,77 @@
"""Example: classify tweets matching a query and push results to the DailyGet sink.

Required environment variables: DAILYGET_URL, DAILYGET_PARTNER_ID,
DAILYGET_CONSUMER_NUMBER, DAILYGET_QUERY, DAILYGET_LOOKUP_PERIOD.
Twitter credentials are presumably picked up from the environment by
TwitterSourceConfig — TODO confirm against obsei.source.twitter_source.
"""
import logging
import os
import sys
from pathlib import Path

from obsei.sink.dailyget_sink import DailyGetSink, DailyGetSinkConfig
from obsei.source.twitter_source import TwitterSource, TwitterSourceConfig
from obsei.analyzer.classification_analyzer import (
    ClassificationAnalyzerConfig,
    ZeroShotClassificationAnalyzer,
)

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Sink configuration: where and on whose behalf analyzed tweets are delivered.
sink_config = DailyGetSinkConfig(
    url=os.environ["DAILYGET_URL"],
    partner_id=os.environ["DAILYGET_PARTNER_ID"],
    consumer_phone_number=os.environ["DAILYGET_CONSUMER_NUMBER"],
    source_information="Twitter " + os.environ["DAILYGET_QUERY"],
    # base_payload is merged into every outgoing request body.
    base_payload={
        "partnerId": os.environ["DAILYGET_PARTNER_ID"],
        "consumerPhoneNumber": os.environ["DAILYGET_CONSUMER_NUMBER"],
    },
)

# NOTE(review): dir_path is computed but never used — candidate for removal.
dir_path = Path(__file__).resolve().parent.parent
source_config = TwitterSourceConfig(
    keywords=[os.environ["DAILYGET_QUERY"]],
    lookup_period=os.environ["DAILYGET_LOOKUP_PERIOD"],
    # Fields/expansions requested from the Twitter API for each tweet.
    tweet_fields=[
        "author_id",
        "conversation_id",
        "created_at",
        "id",
        "public_metrics",
        "text",
    ],
    user_fields=["id", "name", "public_metrics", "username", "verified"],
    expansions=["author_id"],
    place_fields=None,
    max_tweets=10,
)

source = TwitterSource()
sink = DailyGetSink()
text_analyzer = ZeroShotClassificationAnalyzer(
    model_name_or_path="joeddav/bart-large-mnli-yahoo-answers",
    # model_name_or_path="joeddav/xlm-roberta-large-xnli",
)

# 1) Fetch tweets matching the configured query.
source_response_list = source.lookup(source_config)
for idx, source_response in enumerate(source_response_list):
    logger.info(f"source_response#'{idx}'='{source_response.__dict__}'")

# 2) Zero-shot classify each tweet against the support-issue labels below.
analyzer_response_list = text_analyzer.analyze_input(
    source_response_list=source_response_list,
    analyzer_config=ClassificationAnalyzerConfig(
        labels=[
            "service",
            "delay",
            "tracking",
            "no response",
            "missing items",
            "delivery",
            "mask",
        ],
    ),
)
for idx, an_response in enumerate(analyzer_response_list):
    logger.info(f"analyzer_response#'{idx}'='{an_response.__dict__}'")

# HTTP Sink
# 3) Deliver the analyzed payloads; send_data may yield None entries.
sink_response_list = sink.send_data(analyzer_response_list, sink_config)
for sink_response in sink_response_list:
    if sink_response is not None:
        logger.info(f"sink_response='{sink_response.__dict__}'")
diff --git a/obsei_module/example/elasticsearch_example.py b/obsei_module/example/elasticsearch_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..81bc7594ab85cf66d6259d7b41c15ebf12c473fc
--- /dev/null
+++ b/obsei_module/example/elasticsearch_example.py
@@ -0,0 +1,69 @@
"""Example: classify tweets mentioning a handle and index results into Elasticsearch."""
import logging
import sys
from pathlib import Path

from obsei.sink.elasticsearch_sink import ElasticSearchSink, ElasticSearchSinkConfig
from obsei.source.twitter_source import TwitterSource, TwitterSourceConfig
from obsei.analyzer.classification_analyzer import (
    ClassificationAnalyzerConfig,
    ZeroShotClassificationAnalyzer,
)

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# NOTE(review): dir_path is computed but never used — candidate for removal.
dir_path = Path(__file__).resolve().parent.parent
source_config = TwitterSourceConfig(
    keywords="@Handle",  # replace with the Twitter handle to monitor
    lookup_period="1h",  # 1 Hour
    # Fields/expansions requested from the Twitter API for each tweet.
    tweet_fields=[
        "author_id",
        "conversation_id",
        "created_at",
        "id",
        "public_metrics",
        "text",
    ],
    user_fields=["id", "name", "public_metrics", "username", "verified"],
    expansions=["author_id"],
    place_fields=None,
    max_tweets=10,
)

source = TwitterSource()
text_analyzer = ZeroShotClassificationAnalyzer(
    model_name_or_path="joeddav/bart-large-mnli-yahoo-answers",
)

# Start Elasticsearch server locally
# `docker run -d --name elasticsearch -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2`
sink_config = ElasticSearchSinkConfig(
    host="localhost",
    port=9200,
    index_name="test",
)

# 1) Fetch tweets, 2) classify them, 3) index the results.
source_response_list = source.lookup(source_config)
for idx, source_response in enumerate(source_response_list):
    logger.info(f"source_response#'{idx}'='{source_response.__dict__}'")

analyzer_response_list = text_analyzer.analyze_input(
    source_response_list=source_response_list,
    analyzer_config=ClassificationAnalyzerConfig(
        labels=[
            "service",
            "delay",
            "tracking",
            "no response",
            "missing items",
            "delivery",
            "mask",
        ],
    ),
)
for idx, an_response in enumerate(analyzer_response_list):
    logger.info(f"analyzer_response#'{idx}'='{an_response.__dict__}'")

sink = ElasticSearchSink()
sink_response = sink.send_data(analyzer_response_list, sink_config)
logger.info(f"sink_response='{sink_response}'")
diff --git a/obsei_module/example/email_source_example.py b/obsei_module/example/email_source_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..414819c8c56a5de328d7c7dbe694a5d9d5f4f2ef
--- /dev/null
+++ b/obsei_module/example/email_source_example.py
@@ -0,0 +1,36 @@
"""Example: fetch recent emails from an IMAP mailbox and log them."""
import logging
import os
import sys
from datetime import datetime, timedelta

import pytz

from obsei.misc.utils import DATETIME_STRING_PATTERN
from obsei.source.email_source import EmailConfig, EmailCredInfo, EmailSource

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Fetch mails received during the last 10 hours.
# BUG FIX: datetime.utcnow() returns a *naive* datetime and astimezone()
# interprets a naive datetime as local time, which skews the lookup window on
# non-UTC machines. datetime.now(tz=...) yields a correct aware UTC timestamp.
since_time = datetime.now(tz=pytz.utc) - timedelta(hours=10)

# List of IMAP servers for most commonly used email providers
# https://www.systoolsgroup.com/imap/
# Also, if you're using a Gmail account then make sure you allow less secure apps on your account -
# https://myaccount.google.com/lesssecureapps?pli=1
# Also enable IMAP access -
# https://mail.google.com/mail/u/0/#settings/fwdandpop
source_config = EmailConfig(
    imap_server="imap.gmail.com",
    cred_info=EmailCredInfo(
        # It will fetch username and password from environment variable
        username=os.environ.get("email_username"),
        password=os.environ.get("email_password"),
    ),
    lookup_period=since_time.strftime(DATETIME_STRING_PATTERN),
)

source = EmailSource()
source_response_list = source.lookup(source_config)

for source_response in source_response_list:
    logger.info(source_response.__dict__)
diff --git a/obsei_module/example/facebook_example.py b/obsei_module/example/facebook_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..207e1eb005288648bc2c67f15150496e3fd66ab9
--- /dev/null
+++ b/obsei_module/example/facebook_example.py
@@ -0,0 +1,19 @@
"""Example: fetch recent posts from a Facebook page and log them."""
import logging
import sys

from obsei.source.facebook_source import FacebookSource, FacebookSourceConfig

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Look back two months on the configured page id.
source_config = FacebookSourceConfig(page_id="110844591144719", lookup_period="2M")
source = FacebookSource()
source_response_list = source.lookup(source_config)

# First dump the full payload of every post...
logger.info("DETAILS:")
for response in source_response_list:
    logger.info(response)

# ...then only the extracted text of each post.
logger.info("TEXT:")
for response in source_response_list:
    logger.info(response.processed_text)
diff --git a/obsei_module/example/google_news_example.py b/obsei_module/example/google_news_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..40277f61d7f7b37f62154dd84ed7bb2003a35e9a
--- /dev/null
+++ b/obsei_module/example/google_news_example.py
@@ -0,0 +1,58 @@
"""Example: fetch Google News articles (with and without full text) and classify them."""
from obsei.analyzer.classification_analyzer import (
    ClassificationAnalyzerConfig,
    ZeroShotClassificationAnalyzer,
)
from obsei.source.google_news_source import GoogleNewsConfig, GoogleNewsSource

# Only fetch title and highlight
source_config_without_full_text = GoogleNewsConfig(
    query="ai",
    max_results=150,
    # Explicit date window instead of a relative lookup_period.
    after_date='2023-12-01',
    before_date='2023-12-31',
)

# Fetch full news article
source_config_with_full_text = GoogleNewsConfig(
    query="ai",
    max_results=5,
    fetch_article=True,  # download and extract the article body, not just the headline
    lookup_period="1d",
    # proxy="http://127.0.0.1:8080"
)

source = GoogleNewsSource()

# Shared zero-shot labels applied to both article sets.
analyzer_config = ClassificationAnalyzerConfig(
    labels=["buy", "sell", "going up", "going down"],
)

text_analyzer = ZeroShotClassificationAnalyzer(
    model_name_or_path="typeform/mobilebert-uncased-mnli", device="auto"
)

news_articles_without_full_text = source.lookup(source_config_without_full_text)

news_articles_with_full_text = source.lookup(source_config_with_full_text)


analyzer_responses_without_full_text = text_analyzer.analyze_input(
    source_response_list=news_articles_without_full_text,
    analyzer_config=analyzer_config,
)

analyzer_responses_with_full_text = text_analyzer.analyze_input(
    source_response_list=news_articles_with_full_text, analyzer_config=analyzer_config
)

# Dump raw articles followed by their classification results.
for article in news_articles_without_full_text:
    print(article.__dict__)

for response in analyzer_responses_without_full_text:
    print(response.__dict__)

for article in news_articles_with_full_text:
    print(article.__dict__)

for response in analyzer_responses_with_full_text:
    print(response.__dict__)
diff --git a/obsei_module/example/jira_example.py b/obsei_module/example/jira_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..00b59600731b689da9ab57bffd71285e729754e3
--- /dev/null
+++ b/obsei_module/example/jira_example.py
@@ -0,0 +1,77 @@
# Jira Sink
"""Example: classify tweets about issues and create Jira tickets from them.

Requires env vars twitter_consumer_key and twitter_consumer_secret.
"""
import logging
import os
import sys
from pathlib import Path

from pydantic import SecretStr

from obsei.sink.jira_sink import JiraSink, JiraSinkConfig
from obsei.source.twitter_source import (
    TwitterCredentials,
    TwitterSource,
    TwitterSourceConfig,
)
from obsei.analyzer.classification_analyzer import (
    ClassificationAnalyzerConfig,
    ZeroShotClassificationAnalyzer,
)

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# NOTE(review): dir_path is computed but never used — candidate for removal.
dir_path = Path(__file__).resolve().parent.parent
source_config = TwitterSourceConfig(
    keywords=["facing issue"],  # search phrase for complaint-like tweets
    lookup_period="1h",
    tweet_fields=[
        "author_id",
        "conversation_id",
        "created_at",
        "id",
        "public_metrics",
        "text",
    ],
    user_fields=["id", "name", "public_metrics", "username", "verified"],
    expansions=["author_id"],
    place_fields=None,
    max_tweets=10,
    cred_info=TwitterCredentials(
        consumer_key=SecretStr(os.environ["twitter_consumer_key"]),
        consumer_secret=SecretStr(os.environ["twitter_consumer_secret"]),
    ),
)

source = TwitterSource()

# To start jira server locally `atlas-run-standalone --product jira`
jira_sink_config = JiraSinkConfig(
    url="http://localhost:2990/jira",
    username=SecretStr("admin"),
    password=SecretStr("admin"),
    # Each analyzed tweet becomes a Task in project CUS.
    issue_type={"name": "Task"},
    project={"key": "CUS"},
)
jira_sink = JiraSink()

text_analyzer = ZeroShotClassificationAnalyzer(
    model_name_or_path="joeddav/bart-large-mnli-yahoo-answers"
)

# 1) Fetch tweets, 2) classify, 3) raise Jira tickets.
source_response_list = source.lookup(source_config)
for idx, source_response in enumerate(source_response_list):
    logger.info(f"source_response#'{idx}'='{source_response.__dict__}'")

analyzer_response_list = text_analyzer.analyze_input(
    source_response_list=source_response_list,
    analyzer_config=ClassificationAnalyzerConfig(
        labels=["service", "delay", "performance"],
    ),
)
for idx, an_response in enumerate(analyzer_response_list):
    logger.info(f"analyzer_response#'{idx}'='{an_response.__dict__}'")

sink_response_list = jira_sink.send_data(analyzer_response_list, jira_sink_config)
for sink_response in sink_response_list:
    if sink_response is not None:
        logger.info(f"sink_response='{sink_response}'")
diff --git a/obsei_module/example/maps_review_scrapper_example.py b/obsei_module/example/maps_review_scrapper_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0633a025bb9fc15c933bd5a5a4058a0012e6392
--- /dev/null
+++ b/obsei_module/example/maps_review_scrapper_example.py
@@ -0,0 +1,22 @@
"""Example: pull Google Maps reviews for a place via the Outscraper-backed source."""
import logging
import sys

from obsei.source.google_maps_reviews import (
    OSGoogleMapsReviewsConfig,
    OSGoogleMapsReviewsSource,
)

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Each query is a Maps place URL; number_of_reviews caps the fetch per place.
source_config = OSGoogleMapsReviewsConfig(
    api_key="",  # Get API key from https://outscraper.com/
    queries=[
        "https://www.google.co.in/maps/place/Taj+Mahal/@27.1751496,78.0399535,17z/data=!4m5!3m4!1s0x39747121d702ff6d:0xdd2ae4803f767dde!8m2!3d27.1751448!4d78.0421422"
    ],
    number_of_reviews=3,
)

source = OSGoogleMapsReviewsSource()

# Fetch the reviews and log each payload's attributes.
for review in source.lookup(source_config):
    logger.info(review.__dict__)
diff --git a/obsei_module/example/pandas_sink_example.py b/obsei_module/example/pandas_sink_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2d817ad0c369e4b81eda9f754f149c50c2875c8
--- /dev/null
+++ b/obsei_module/example/pandas_sink_example.py
@@ -0,0 +1,49 @@
"""Example: scrape Play Store reviews, classify them, and collect results in a DataFrame."""
import logging
import sys

from pandas import DataFrame

from obsei.analyzer.classification_analyzer import (
    ClassificationAnalyzerConfig,
    ZeroShotClassificationAnalyzer,
)
from obsei.sink.pandas_sink import PandasSink, PandasSinkConfig
from obsei.source.playstore_scrapper import (
    PlayStoreScrapperConfig,
    PlayStoreScrapperSource,
)


logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Scrape up to 3 US reviews of the given package.
source_config = PlayStoreScrapperConfig(
    countries=["us"], package_name="com.apcoaconnect", max_count=3
)

source = PlayStoreScrapperSource()

text_analyzer = ZeroShotClassificationAnalyzer(
    model_name_or_path="typeform/mobilebert-uncased-mnli", device="auto"
)

# initialize pandas sink config
# The sink appends analyzed rows into this (initially empty) DataFrame.
sink_config = PandasSinkConfig(dataframe=DataFrame())

# initialize pandas sink
sink = PandasSink()

source_response_list = source.lookup(source_config)

analyzer_response_list = text_analyzer.analyze_input(
    source_response_list=source_response_list,
    analyzer_config=ClassificationAnalyzerConfig(
        labels=["no parking", "registration issue", "app issue", "payment issue"],
    ),
)

# send_data returns the populated DataFrame.
dataframe = sink.send_data(
    analyzer_responses=analyzer_response_list, config=sink_config
)

print(dataframe.to_csv())
diff --git a/obsei_module/example/pandas_source_example.py b/obsei_module/example/pandas_source_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a82af3ac3cb46fd4b08de92b00b04754249511c
--- /dev/null
+++ b/obsei_module/example/pandas_source_example.py
@@ -0,0 +1,27 @@
"""Example: use a pandas DataFrame as an Obsei source."""
import pandas as pd

from obsei.source.pandas_source import (
    PandasSourceConfig,
    PandasSource,
)
import logging
import sys

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Any DataFrame works here (csv, excel, sql, ...); the sample csv below has
# two columns: title and text.
csv_file = "https://raw.githubusercontent.com/deepset-ai/haystack/master/tutorials/small_generator_dataset.csv"
dataframe = pd.read_csv(csv_file)

# "text" columns become the payload text; "title" is carried along as metadata.
source_config = PandasSourceConfig(
    dataframe=dataframe,
    include_columns=["title"],
    text_columns=["text"],
)
source = PandasSource()

source_response_list = source.lookup(source_config)
for idx, payload in enumerate(source_response_list):
    logger.info(f"source_response#'{idx}'='{payload.__dict__}'")
diff --git a/obsei_module/example/pii_analyzer_example.py b/obsei_module/example/pii_analyzer_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..36ec4ff72c3f5221ccbc7c35d74897619ce69514
--- /dev/null
+++ b/obsei_module/example/pii_analyzer_example.py
@@ -0,0 +1,33 @@
"""Example: detect and anonymize PII in text using the Presidio-based analyzer."""
import logging
import sys

from obsei.payload import TextPayload
from obsei.analyzer.pii_analyzer import (
    PresidioEngineConfig,
    PresidioModelConfig,
    PresidioPIIAnalyzer,
    PresidioPIIAnalyzerConfig,
)

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# analyze_only=False also anonymizes; return_decision_process exposes why
# each entity was flagged.
analyzer_config = PresidioPIIAnalyzerConfig(
    analyze_only=False, return_decision_process=True
)
analyzer = PresidioPIIAnalyzer(
    engine_config=PresidioEngineConfig(
        nlp_engine_name="spacy",
        models=[PresidioModelConfig(model_name="en_core_web_lg", lang_code="en")],
    )
)

text_to_anonymize = "His name is Mr. Jones and his phone number is 212-555-5555"

analyzer_results = analyzer.analyze_input(
    source_response_list=[TextPayload(processed_text=text_to_anonymize)],
    analyzer_config=analyzer_config,
)

for analyzer_result in analyzer_results:
    # BUG FIX: was `logging.info(...)` (root logger); use the module logger
    # configured above, consistent with the rest of the examples.
    logger.info(analyzer_result.to_dict())
diff --git a/obsei_module/example/play_store_reviews_example.py b/obsei_module/example/play_store_reviews_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..d37669a7a4441ce69be05152c7dad7aad5edd538
--- /dev/null
+++ b/obsei_module/example/play_store_reviews_example.py
@@ -0,0 +1,4 @@
# TBD
# Placeholder example for the Play Store reviews (official API) source.
# Need proper service account file to test the changes :(
print("TBD")
diff --git a/obsei_module/example/playstore_scrapper_example.py b/obsei_module/example/playstore_scrapper_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..00b1a5406f9c9785bd08262559edca0309832617
--- /dev/null
+++ b/obsei_module/example/playstore_scrapper_example.py
@@ -0,0 +1,40 @@
"""Example: scrape Google Play reviews by app URL and zero-shot classify them."""
import logging
import sys

from obsei.analyzer.classification_analyzer import (
    ClassificationAnalyzerConfig,
    ZeroShotClassificationAnalyzer,
)

from obsei.source.playstore_scrapper import (
    PlayStoreScrapperConfig,
    PlayStoreScrapperSource,
)


logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Scrape at most 3 reviews of the app identified by this Play Store URL.
source_config = PlayStoreScrapperConfig(
    app_url='https://play.google.com/store/apps/details?id=com.google.android.gm&hl=en_IN&gl=US',
    max_count=3
)

source = PlayStoreScrapperSource()

# Small MNLI model; device="auto" picks GPU when available.
text_analyzer = ZeroShotClassificationAnalyzer(
    model_name_or_path="typeform/mobilebert-uncased-mnli", device="auto"
)

source_response_list = source.lookup(source_config)
for idx, source_response in enumerate(source_response_list):
    logger.info(f"source_response#'{idx}'='{source_response.__dict__}'")

# Score each review against the candidate labels.
analyzer_response_list = text_analyzer.analyze_input(
    source_response_list=source_response_list,
    analyzer_config=ClassificationAnalyzerConfig(
        labels=["interface", "slow", "battery"],
    ),
)
for idx, an_response in enumerate(analyzer_response_list):
    logger.info(f"analyzer_response#'{idx}'='{an_response.__dict__}'")
diff --git a/obsei_module/example/playstore_scrapper_translator_example.py b/obsei_module/example/playstore_scrapper_translator_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..e89e09be4a17334368fcfe44039fa294728d020e
--- /dev/null
+++ b/obsei_module/example/playstore_scrapper_translator_example.py
@@ -0,0 +1,86 @@
"""Example pipeline: Play Store reviews -> Hindi-to-English translation -> classification."""
import json
import logging
import sys
from datetime import datetime, timedelta

import pytz

from obsei.payload import TextPayload
from obsei.analyzer.classification_analyzer import (
    ClassificationAnalyzerConfig,
    ZeroShotClassificationAnalyzer,
)
from obsei.analyzer.translation_analyzer import TranslationAnalyzer
from obsei.misc.utils import DATETIME_STRING_PATTERN
from obsei.source.playstore_scrapper import (
    PlayStoreScrapperConfig,
    PlayStoreScrapperSource,
)


logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
source = PlayStoreScrapperSource()


def source_fetch():
    """Fetch up to 5 US Play Store reviews posted during the last day."""
    # BUG FIX: datetime.utcnow() is naive and astimezone() interprets a naive
    # datetime as local time, skewing the window on non-UTC machines;
    # datetime.now(tz=...) gives an aware UTC timestamp directly.
    since_time = datetime.now(tz=pytz.utc) - timedelta(days=1)
    source_config = PlayStoreScrapperConfig(
        countries=["us"],
        package_name="com.color.apps.hindikeyboard.hindi.language",
        lookup_period=since_time.strftime(
            DATETIME_STRING_PATTERN
        ),  # todo should be optional
        max_count=5,
    )
    return source.lookup(source_config)


def translate_text(text_list):
    """Translate each payload's text from Hindi to English.

    Returns new TextPayload objects carrying the translated text.
    """
    translate_analyzer = TranslationAnalyzer(
        model_name_or_path="Helsinki-NLP/opus-mt-hi-en", device="auto"
    )
    source_responses = [
        TextPayload(processed_text=text.processed_text, source_name="sample")
        for text in text_list
    ]
    analyzer_responses = translate_analyzer.analyze_input(
        source_response_list=source_responses
    )
    return [
        TextPayload(
            processed_text=response.segmented_data["translated_text"],
            source_name="translator",
        )
        for response in analyzer_responses
    ]


def classify_text(text_list):
    """Zero-shot classify the (translated) payloads against fixed issue labels."""
    text_analyzer = ZeroShotClassificationAnalyzer(
        model_name_or_path="joeddav/bart-large-mnli-yahoo-answers", device="cpu"
    )

    return text_analyzer.analyze_input(
        source_response_list=text_list,
        analyzer_config=ClassificationAnalyzerConfig(
            labels=["no parking", "registration issue", "app issue", "payment issue"],
        ),
    )


def print_list(text_name, text_list):
    """Log every item of text_list as pretty-printed JSON, labelled text_name."""
    for idx, text in enumerate(text_list):
        json_response = json.dumps(text.__dict__, indent=4, sort_keys=True, default=str)
        logger.info(f"\n{text_name}#'{idx}'='{json_response}'")


logger.info("Started...")

source_responses_list = source_fetch()
translated_text_list = translate_text(source_responses_list)
analyzer_response_list = classify_text(translated_text_list)

print_list("source_response", source_responses_list)
print_list("translator_response", translated_text_list)
print_list("classifier_response", analyzer_response_list)
diff --git a/obsei_module/example/reddit_example.py b/obsei_module/example/reddit_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdf3a8d60c0058e8cdde32914d1b984d7cbc848f
--- /dev/null
+++ b/obsei_module/example/reddit_example.py
@@ -0,0 +1,50 @@
"""Example: poll a subreddit via the Reddit API, persisting lookup state in a workflow store."""
import logging
import sys
import time
from datetime import datetime, timedelta

import pytz

from obsei.misc.utils import DATETIME_STRING_PATTERN
from obsei.source.reddit_source import RedditConfig, RedditSource
from obsei.workflow.store import WorkflowStore
from obsei.workflow.workflow import Workflow, WorkflowConfig


def print_state(id: str):
    """Log the persisted source state for the given workflow id."""
    logger.info(f"Source State: {source.store.get_source_state(id)}")


logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Fetch posts from the last 2 hours.
# BUG FIX: datetime.utcnow() is naive and astimezone() interprets a naive
# datetime as local time, skewing the window on non-UTC machines;
# datetime.now(tz=...) gives an aware UTC timestamp directly.
since_time = datetime.now(tz=pytz.utc) - timedelta(hours=2)
# Credentials will be fetched from env variable named reddit_client_id and reddit_client_secret
source_config = RedditConfig(
    subreddits=["wallstreetbets"],
    lookup_period=since_time.strftime(DATETIME_STRING_PATTERN),
)

source = RedditSource(store=WorkflowStore())

workflow = Workflow(
    config=WorkflowConfig(
        source_config=source_config,
    ),
)
source.store.add_workflow(workflow)


# Poll up to 3 times; each lookup resumes from the stored workflow state.
for i in range(1, 4):
    print_state(workflow.id)
    source_response_list = source.lookup(source_config, id=workflow.id)

    # Stop when there is nothing new to fetch.
    if not source_response_list:
        break

    for source_response in source_response_list:
        logger.info(source_response.__dict__)

    time.sleep(10)

print_state(workflow.id)
diff --git a/obsei_module/example/reddit_scrapper_example.py b/obsei_module/example/reddit_scrapper_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..f306024440302f61ebf8f95f29adef98df0f8aaf
--- /dev/null
+++ b/obsei_module/example/reddit_scrapper_example.py
@@ -0,0 +1,30 @@
import logging
import sys
from datetime import datetime, timedelta

import pytz

from obsei.misc.utils import DATETIME_STRING_PATTERN
from obsei.source.reddit_scrapper import RedditScrapperConfig, RedditScrapperSource


def print_state(workflow_id: str) -> None:
    """Log the source state persisted in the store for *workflow_id*."""
    logger.info(f"Source State: {source.store.get_source_state(workflow_id)}")


logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Look back one day from now.
# FIX: datetime.utcnow() returns a *naive* datetime, and .astimezone()
# interprets a naive datetime as *local* time, shifting the lookup window by
# the local UTC offset. datetime.now(pytz.utc) gives the intended aware UTC
# timestamp directly.
since_time = datetime.now(pytz.utc) - timedelta(days=1)

source_config = RedditScrapperConfig(
    url="https://www.reddit.com/r/wallstreetbets/comments/.rss?sort=new",
    user_agent="testscript by u/FitStatistician7378",
    lookup_period=since_time.strftime(DATETIME_STRING_PATTERN),
)

source = RedditScrapperSource()

source_response_list = source.lookup(source_config)
for source_response in source_response_list:
    logger.info(source_response.__dict__)
diff --git a/obsei_module/example/sdk.yaml b/obsei_module/example/sdk.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..89f5d55d3e42af1fc5fe1be71f2d92930342dde4
--- /dev/null
+++ b/obsei_module/example/sdk.yaml
@@ -0,0 +1,97 @@
+twitter_source:
+ _target_: obsei.source.twitter_source.TwitterSourceConfig
+ keywords:
+ - "@sample"
+ lookup_period: "1d"
+ tweet_fields:
+ - "author_id"
+ - "conversation_id"
+ - "created_at"
+ - "id"
+ - "public_metrics"
+ - "text"
+ user_fields:
+ - "id"
+ - "name"
+ - "public_metrics"
+ - "username"
+ - "verified"
+ expansions:
+ - "author_id"
+ place_fields: []
+ max_tweets: 10
+ credential:
+ _target_: obsei.source.twitter_source.TwitterCredentials
+ bearer_token: "bearer_token"
+
+play_store_source:
+ _target_: obsei.source.playstore_reviews.PlayStoreConfig
+ package_name: "com.company.package"
+ max_results: 10
+ num_retries: 2
+ cred_info:
+ _target_: obsei.source.playstore_reviews.GoogleCredInfo
+ service_cred_file: "foo/credential.json"
+ developer_key: "test_key"
+
+daily_get_sink:
+ _target_: obsei.sink.dailyget_sink.DailyGetSinkConfig
+ url: "http://localhost:8080/sample"
+ partner_id: "123456"
+ consumer_phone_number: "1234567890"
+ source_information: "Twitter @sample"
+ base_payload:
+ partnerId: daily_get_sink.partner_id
+ consumerPhoneNumber: daily_get_sink.consumer_phone_number
+
+http_sink:
+ _target_: obsei.sink.http_sink.HttpSinkConfig
+ url: "http://localhost:8080/sample"
+
+elasticsearch_sink:
+ _target_: obsei.sink.elasticsearch_sink.ElasticSearchSinkConfig
+ host: "localhost"
+ port: 9200
+ index_name: "test"
+
+jira_sink:
+ _target_: obsei.sink.jira_sink.JiraSinkConfig
+ url: "http://localhost:2990/jira"
+ username: "user"
+ password: "pass"
+ issue_type:
+ name: "Task"
+ project:
+ key: "CUS"
+
+analyzer_config:
+ _target_: obsei.analyzer.classification_analyzer.ClassificationAnalyzerConfig
+ labels:
+ - "service"
+ - "delay"
+ - "tracking"
+ - "no response"
+ add_positive_negative_labels: false
+
+analyzer:
+ _target_: obsei.analyzer.classification_analyzer.ZeroShotClassificationAnalyzer
+ model_name_or_path: "typeform/mobilebert-uncased-mnli"
+ device: "auto"
+
+slack_sink:
+ _target_: obsei.sink.SlackSink
+
+slack_sink_config:
+ _target_: obsei.sink.SlackSinkConfig
+ slack_token: 'Enter token'
+ channel_id: 'slack channel id'
+ jinja_template: |
+ ```
+ {%- for key, value in payload.items() recursive%}
+ {%- if value is mapping -%}
+ {{loop(value.items())}}
+ {%- else %}
+ {{key}}: {{value}}
+ {%- endif %}
+ {%- endfor%}
+ ```
diff --git a/obsei_module/example/slack_example.py b/obsei_module/example/slack_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d7d8a6d1730a7ef37217940eaafd5d96b40f2b0
--- /dev/null
+++ b/obsei_module/example/slack_example.py
@@ -0,0 +1,66 @@
import logging
import os
import sys

from obsei.analyzer.dummy_analyzer import DummyAnalyzer, DummyAnalyzerConfig
from obsei.processor import Processor
from obsei.sink.slack_sink import SlackSink, SlackSinkConfig
from obsei.source.playstore_scrapper import (PlayStoreScrapperConfig,
                                             PlayStoreScrapperSource)
from obsei.workflow.store import WorkflowStore
from obsei.workflow.workflow import Workflow, WorkflowConfig


def print_state(identifier: str) -> None:
    """Log the source state persisted in the store for the given workflow id."""
    logger.info(f"Source State: {source.store.get_source_state(identifier)}")


logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)


# Single store shared by source and sink so workflow state is persisted.
workflow_store = WorkflowStore()

# Scrape at most 3 Play Store reviews of the given app URL.
source_config = PlayStoreScrapperConfig(
    app_url='https://play.google.com/store/apps/details?id=com.google.android.gm&hl=en_IN&gl=US',
    max_count=3
)

source = PlayStoreScrapperSource(store=workflow_store)

# Slack token is read from the SLACK_TOKEN env variable (KeyError if unset).
# The Jinja template renders each payload as a fenced key/value dump.
sink_config = SlackSinkConfig(
    slack_token=os.environ["SLACK_TOKEN"],
    channel_id="C01TUPZ23NZ",
    jinja_template="""
```
    {%- for key, value in payload.items() recursive%}
    {%- if value is mapping -%}
{{loop(value.items())}}
    {%- else %}
{{key}}: {{value}}
    {%- endif %}
    {%- endfor%}
```
    """
)
sink = SlackSink(store=workflow_store)

# DummyAnalyzer — presumably a no-op passthrough analyzer; confirm in obsei docs.
analyzer_config = DummyAnalyzerConfig()
analyzer = DummyAnalyzer()

workflow = Workflow(
    config=WorkflowConfig(
        source_config=source_config,
        sink_config=sink_config,
        analyzer_config=analyzer_config,
    ),
)
workflow_store.add_workflow(workflow)

# Wire source -> analyzer -> sink and execute the workflow once.
processor = Processor(
    analyzer=analyzer, sink=sink, source=source, analyzer_config=analyzer_config
)

processor.process(workflow=workflow)

print_state(workflow.id)
diff --git a/obsei_module/example/twitter_source_example.py b/obsei_module/example/twitter_source_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc92b681dd4b2ea248162954c0ee2d2306b110f0
--- /dev/null
+++ b/obsei_module/example/twitter_source_example.py
@@ -0,0 +1,98 @@
import logging
import sys

from obsei.analyzer.classification_analyzer import ZeroShotClassificationAnalyzer, ClassificationAnalyzerConfig
from obsei.sink.slack_sink import SlackSinkConfig, SlackSink
from obsei.source.twitter_source import TwitterSourceConfig, TwitterSource, TwitterCredentials

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

twitter_cred_info = None

# Enter your twitter credentials
# Get it from https://developer.twitter.com/en/apply-for-access
# Currently it will fetch from environment variables: twitter_bearer_token, twitter_consumer_key, twitter_consumer_secret
# Uncomment below lines if you like to pass credentials directly instead of env variables

# twitter_cred_info = TwitterCredentials(
#     bearer_token='',
#     consumer_key="",
#     consumer_secret=""
# )

source_config = TwitterSourceConfig(
    query="bitcoin",
    lookup_period="1h",
    tweet_fields=[
        "author_id",
        "conversation_id",
        "created_at",
        "id",
        "public_metrics",
        "text",
    ],
    user_fields=["id", "name", "public_metrics", "username", "verified"],
    expansions=["author_id"],
    place_fields=None,
    max_tweets=10,
    # FIX: `twitter_cred_info or None` was redundant — when twitter_cred_info
    # is None the expression already evaluates to None.
    cred_info=twitter_cred_info,
)

source = TwitterSource()


sink_config = SlackSinkConfig(
    # Uncomment below lines if you like to pass credentials directly instead of env variables
    # slack_token="SLACK_TOKEN",
    # channel_id="CHANNEL_ID",
    jinja_template="""
:bell: Hi there!, a new `<{{payload['meta']['tweet_url']}}|tweet>` of interest is found by *Obsei*
>📝 Content:
```{{payload['meta']['text']}}```
>ℹ️Information:
```
User Name: {{payload['meta']['author_info']['name']}} ({{payload['meta']['author_info']['user_url']}})
Tweet Metrics: Retweets={{payload['meta']['public_metrics']['retweet_count']}}, Likes={{payload['meta']['public_metrics']['like_count']}}
Author Metrics: Verified={{payload['meta']['author_info']['verified']}}, Followers={{payload['meta']['author_info']['public_metrics']['followers_count']}}
```
>🧠 AI Engine Data:
```
    {%- for key, value in payload['segmented_data']['classifier_data'].items() recursive%}
    {%- if value is mapping -%}
{{loop(value.items())}}
    {%- else %}
{{key}}: {{value}}
    {%- endif %}
    {%- endfor%}
```
    """
)
sink = SlackSink()

text_analyzer = ZeroShotClassificationAnalyzer(
    model_name_or_path="typeform/mobilebert-uncased-mnli", device="auto"
)

analyzer_config = ClassificationAnalyzerConfig(
    labels=["going up", "going down"],
    add_positive_negative_labels=False,
)

source_response_list = source.lookup(source_config)
for idx, source_response in enumerate(source_response_list):
    logger.info(f"source_response#'{idx}'='{source_response.__dict__}'")

analyzer_response_list = text_analyzer.analyze_input(
    source_response_list=source_response_list,
    analyzer_config=analyzer_config,
)

for idx, an_response in enumerate(analyzer_response_list):
    logger.info(f"analyzer_response#'{idx}'='{an_response.__dict__}'")

# FIX: the original passed `id=id`, i.e. the *builtin* id() function, as the
# workflow identifier. No workflow store is used in this example, so no
# meaningful id exists — pass None instead.
sink_response_list = sink.send_data(
    analyzer_responses=analyzer_response_list, config=sink_config, id=None
)
# FIX: log label previously said "source_response" for sink responses (copy-paste).
for idx, sink_response in enumerate(sink_response_list):
    logger.info(f"sink_response#'{idx}'='{sink_response.__dict__}'")
diff --git a/obsei_module/example/web_crawler_example.py b/obsei_module/example/web_crawler_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..51e5d68ba7ea902781110b1cdf327cba25630383
--- /dev/null
+++ b/obsei_module/example/web_crawler_example.py
@@ -0,0 +1,43 @@
+# Fetch full news article
+from obsei.source.website_crawler_source import (
+ TrafilaturaCrawlerConfig,
+ TrafilaturaCrawlerSource,
+)
+
+
def print_list(response_list):
    """Print the attribute dict of every response in *response_list*."""
    for item in response_list:
        print(vars(item))
+
+
# Single URL: extract the main text content of one page.
source_config = TrafilaturaCrawlerConfig(urls=["https://obsei.github.io/obsei/"])

source = TrafilaturaCrawlerSource()

source_response_list = source.lookup(source_config)
print_list(source_response_list)


# RSS feed (note: fetching and extracting every linked article can take a long time)
source_config = TrafilaturaCrawlerConfig(
    urls=["https://news.google.com/rss/search?q=bitcoin&hl=en&gl=US&ceid=US:en"],
    is_feed=True,
)

source = TrafilaturaCrawlerSource()

source_response_list = source.lookup(source_config)
print_list(source_response_list)


# Full website via its sitemap (note: crawling a whole site can take a long time)
source_config = TrafilaturaCrawlerConfig(
    urls=["https://haystack.deepset.ai/"],
    is_sitemap=True,
)

source = TrafilaturaCrawlerSource()

source_response_list = source.lookup(source_config)
print_list(source_response_list)
diff --git a/obsei_module/example/with_sdk_config_file.py b/obsei_module/example/with_sdk_config_file.py
new file mode 100644
index 0000000000000000000000000000000000000000..37da8373fa032c2ceecab54de181ed9e9190ae04
--- /dev/null
+++ b/obsei_module/example/with_sdk_config_file.py
@@ -0,0 +1,28 @@
import logging
import sys

from obsei.configuration import ObseiConfiguration

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Load the YAML configuration (example/sdk.yaml); each top-level key there
# declares a component via a `_target_` class path.
obsei_configuration = ObseiConfiguration(
    config_path="../example",
    config_filename="sdk.yaml",
)

# Build component instances by their top-level key in the YAML file.
text_analyzer = obsei_configuration.initialize_instance("analyzer")
analyzer_config = obsei_configuration.initialize_instance("analyzer_config")
slack_source_config = obsei_configuration.initialize_instance("slack_sink_config")
slack_sink = obsei_configuration.initialize_instance("slack_sink")

play_store_source_config = obsei_configuration.initialize_instance("play_store_source")
twitter_source_config = obsei_configuration.initialize_instance("twitter_source")
http_sink_config = obsei_configuration.initialize_instance("http_sink")
daily_get_sink_config = obsei_configuration.initialize_instance("daily_get_sink")
# docker run -d --name elasticsearch -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2
elasticsearch_sink_config = obsei_configuration.initialize_instance(
    "elasticsearch_sink"
)
# Start jira server locally `atlas-run-standalone --product jira`
jira_sink_config = obsei_configuration.initialize_instance("jira_sink")
diff --git a/obsei_module/example/with_state_example.py b/obsei_module/example/with_state_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..40ac7a0bc68a781907fd720861f035f557c2b67f
--- /dev/null
+++ b/obsei_module/example/with_state_example.py
@@ -0,0 +1,60 @@
import logging
import sys
import time

from obsei.workflow.store import WorkflowStore
from obsei.source.twitter_source import TwitterSource, TwitterSourceConfig
from obsei.workflow.workflow import Workflow, WorkflowConfig

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Create workflow store instance, by default it will use SQLite to store state data
store = WorkflowStore()

# Pass store reference to observer, so it can use it to store state data
source = TwitterSource(store=store)


def print_state(id: str):
    """Log the source state persisted in the store for the given workflow id."""
    logger.info(f"Source State: {source.store.get_source_state(id)}")


source_config = TwitterSourceConfig(
    keywords=["india"],
    lookup_period="2m",
    tweet_fields=[
        "author_id",
        "conversation_id",
        "created_at",
        "id",
        "public_metrics",
        "text",
    ],
    user_fields=["id", "name", "public_metrics", "username", "verified"],
    expansions=["author_id"],
    place_fields=None,
    max_tweets=10,
)

# Create instance of workflow, adding observer config to it; it will auto-generate a unique workflow id
workflow = Workflow(
    config=WorkflowConfig(
        source_config=source_config,
    ),
)
# Insert workflow config to DB store
store.add_workflow(workflow)

for i in range(1, 4):
    print_state(workflow.id)
    # Now always pass workflow id to lookup function.
    # Observer will fetch old state from DB using this id and later store the
    # updated state back to DB against the same id.
    source_response_list = source.lookup(source_config, id=workflow.id)

    if source_response_list is None or len(source_response_list) == 0:
        break

    time.sleep(180)

print_state(workflow.id)
diff --git a/obsei_module/example/youtube_scrapper_example.py b/obsei_module/example/youtube_scrapper_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bded3d4884a7f7523eea96e9275e648ebd9ad2f
--- /dev/null
+++ b/obsei_module/example/youtube_scrapper_example.py
@@ -0,0 +1,36 @@
import logging
import sys

from obsei.analyzer.classification_analyzer import (
    ClassificationAnalyzerConfig, ZeroShotClassificationAnalyzer)
from obsei.source.youtube_scrapper import (YoutubeScrapperConfig,
                                           YoutubeScrapperSource)

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Scrape up to 10 comments (including replies) of one video, looking back "1Y".
source_config = YoutubeScrapperConfig(
    video_url="https://www.youtube.com/watch?v=uZfns0JIlFk",
    fetch_replies=True,
    max_comments=10,
    lookup_period="1Y",
)

source = YoutubeScrapperSource()

source_response_list = source.lookup(source_config)
for idx, source_response in enumerate(source_response_list):
    logger.info(f"source_response#'{idx}'='{source_response.__dict__}'")

# Zero-shot classifier; device="auto" presumably selects GPU when available —
# confirm against obsei documentation.
text_analyzer = ZeroShotClassificationAnalyzer(
    model_name_or_path="typeform/mobilebert-uncased-mnli", device="auto"
)

# Classify each scraped comment against the candidate labels.
analyzer_response_list = text_analyzer.analyze_input(
    source_response_list=source_response_list,
    analyzer_config=ClassificationAnalyzerConfig(
        labels=["interesting", "enquiring"],
    ),
)
for idx, an_response in enumerate(analyzer_response_list):
    logger.info(f"analyzer_response#'{idx}'='{an_response.__dict__}'")
diff --git a/obsei_module/images/Obsei-flow-diagram.png b/obsei_module/images/Obsei-flow-diagram.png
new file mode 100644
index 0000000000000000000000000000000000000000..e25e5fb52066f679c4ee7433cdcb59c19c4b4bd3
Binary files /dev/null and b/obsei_module/images/Obsei-flow-diagram.png differ
diff --git a/obsei_module/images/Obsei-future-concept.png b/obsei_module/images/Obsei-future-concept.png
new file mode 100644
index 0000000000000000000000000000000000000000..84d97eb97c5cc36b2d471789bad8edc23f2ad48d
Binary files /dev/null and b/obsei_module/images/Obsei-future-concept.png differ
diff --git a/obsei_module/images/jira_screenshot.png b/obsei_module/images/jira_screenshot.png
new file mode 100644
index 0000000000000000000000000000000000000000..d14e391487bdf46c5b279dd0f60c6de16302e901
Binary files /dev/null and b/obsei_module/images/jira_screenshot.png differ
diff --git a/obsei_module/images/logos/Slack_join.svg b/obsei_module/images/logos/Slack_join.svg
new file mode 100644
index 0000000000000000000000000000000000000000..1224dfdaba2b72dbfbc3f86a9e9a29aab4dc96b2
--- /dev/null
+++ b/obsei_module/images/logos/Slack_join.svg
@@ -0,0 +1,49 @@
+
+
+
diff --git a/obsei_module/images/logos/appstore.png b/obsei_module/images/logos/appstore.png
new file mode 100644
index 0000000000000000000000000000000000000000..a88786b5cfc6d42af039a0cf0ca15ef0f4ae378f
Binary files /dev/null and b/obsei_module/images/logos/appstore.png differ
diff --git a/obsei_module/images/logos/classification.png b/obsei_module/images/logos/classification.png
new file mode 100644
index 0000000000000000000000000000000000000000..c272cc21dd36d86e3e0fedb387fafe758af59bb1
Binary files /dev/null and b/obsei_module/images/logos/classification.png differ
diff --git a/obsei_module/images/logos/dummy.png b/obsei_module/images/logos/dummy.png
new file mode 100644
index 0000000000000000000000000000000000000000..65a21ae2882d0e26c88e1694425155a81e418507
Binary files /dev/null and b/obsei_module/images/logos/dummy.png differ
diff --git a/obsei_module/images/logos/elastic.png b/obsei_module/images/logos/elastic.png
new file mode 100644
index 0000000000000000000000000000000000000000..47e53a58326b6bf4d5bc8a9a48811c360311a7dd
Binary files /dev/null and b/obsei_module/images/logos/elastic.png differ
diff --git a/obsei_module/images/logos/facebook.png b/obsei_module/images/logos/facebook.png
new file mode 100644
index 0000000000000000000000000000000000000000..b4374d2b0528a0ec9f617a5c7ee33722e98a7d19
Binary files /dev/null and b/obsei_module/images/logos/facebook.png differ
diff --git a/obsei_module/images/logos/gmail.png b/obsei_module/images/logos/gmail.png
new file mode 100644
index 0000000000000000000000000000000000000000..357c439d8d8d7f884a3eeaaf1e1412d73ff5a72c
Binary files /dev/null and b/obsei_module/images/logos/gmail.png differ
diff --git a/obsei_module/images/logos/googlenews.png b/obsei_module/images/logos/googlenews.png
new file mode 100644
index 0000000000000000000000000000000000000000..a7cf45acdb3bb1303a7e2b7ac8f2db9de11da558
Binary files /dev/null and b/obsei_module/images/logos/googlenews.png differ
diff --git a/obsei_module/images/logos/http_api.png b/obsei_module/images/logos/http_api.png
new file mode 100644
index 0000000000000000000000000000000000000000..8bc5c9ae4f47022088572a1753711bf5fd669948
Binary files /dev/null and b/obsei_module/images/logos/http_api.png differ
diff --git a/obsei_module/images/logos/jira.png b/obsei_module/images/logos/jira.png
new file mode 100644
index 0000000000000000000000000000000000000000..d92cf6f845594ace475c07c07cbbcfac0907a660
Binary files /dev/null and b/obsei_module/images/logos/jira.png differ
diff --git a/obsei_module/images/logos/logger.png b/obsei_module/images/logos/logger.png
new file mode 100644
index 0000000000000000000000000000000000000000..34a68ee5a5e73c2b9963ff9284482dcf4effe0f3
Binary files /dev/null and b/obsei_module/images/logos/logger.png differ
diff --git a/obsei_module/images/logos/ner.png b/obsei_module/images/logos/ner.png
new file mode 100644
index 0000000000000000000000000000000000000000..06297eabfb3d5d589943decf931e2ac096ee38d3
Binary files /dev/null and b/obsei_module/images/logos/ner.png differ
diff --git a/obsei_module/images/logos/obsei_200x200.png b/obsei_module/images/logos/obsei_200x200.png
new file mode 100644
index 0000000000000000000000000000000000000000..cb4cb25373412834d3384c0308c8039667111876
Binary files /dev/null and b/obsei_module/images/logos/obsei_200x200.png differ
diff --git a/obsei_module/images/logos/pandas.svg b/obsei_module/images/logos/pandas.svg
new file mode 100644
index 0000000000000000000000000000000000000000..1451f57de198e7283f900a2538212c3ee27458f9
--- /dev/null
+++ b/obsei_module/images/logos/pandas.svg
@@ -0,0 +1,111 @@
+
+
diff --git a/obsei_module/images/logos/pii.png b/obsei_module/images/logos/pii.png
new file mode 100644
index 0000000000000000000000000000000000000000..13a6826f6f8aed02e7b1e89a2a9fac1ff3510481
Binary files /dev/null and b/obsei_module/images/logos/pii.png differ
diff --git a/obsei_module/images/logos/playstore.png b/obsei_module/images/logos/playstore.png
new file mode 100644
index 0000000000000000000000000000000000000000..c054cd04bb47e26fdba2d5b66071a63317182f36
Binary files /dev/null and b/obsei_module/images/logos/playstore.png differ
diff --git a/obsei_module/images/logos/reddit.png b/obsei_module/images/logos/reddit.png
new file mode 100644
index 0000000000000000000000000000000000000000..695eff14557b7ae25f594febd4cd562013fb9c5a
Binary files /dev/null and b/obsei_module/images/logos/reddit.png differ
diff --git a/obsei_module/images/logos/sentiment.png b/obsei_module/images/logos/sentiment.png
new file mode 100644
index 0000000000000000000000000000000000000000..632d49b471815a10ad16e6bdbe0db53a549b6076
Binary files /dev/null and b/obsei_module/images/logos/sentiment.png differ
diff --git a/obsei_module/images/logos/slack.svg b/obsei_module/images/logos/slack.svg
new file mode 100644
index 0000000000000000000000000000000000000000..c37dc5eb49e3ef638f9dd6f4cf9ab345db8c141d
--- /dev/null
+++ b/obsei_module/images/logos/slack.svg
@@ -0,0 +1,33 @@
+
+
+
diff --git a/obsei_module/images/logos/translator.png b/obsei_module/images/logos/translator.png
new file mode 100644
index 0000000000000000000000000000000000000000..2aa0174fa5f132d6474e051e9be503c105f03719
Binary files /dev/null and b/obsei_module/images/logos/translator.png differ
diff --git a/obsei_module/images/logos/twitter.png b/obsei_module/images/logos/twitter.png
new file mode 100644
index 0000000000000000000000000000000000000000..4e72f8cb100d33850ff0141e009afc30cd0c79b0
Binary files /dev/null and b/obsei_module/images/logos/twitter.png differ
diff --git a/obsei_module/images/logos/webcrawler.png b/obsei_module/images/logos/webcrawler.png
new file mode 100644
index 0000000000000000000000000000000000000000..0fd7f3623df18ed0493e08d526167e7cd1e1ee7e
Binary files /dev/null and b/obsei_module/images/logos/webcrawler.png differ
diff --git a/obsei_module/images/logos/zendesk.png b/obsei_module/images/logos/zendesk.png
new file mode 100644
index 0000000000000000000000000000000000000000..78bfb22e62f72e0a034b8e14a8964d9e5c185375
Binary files /dev/null and b/obsei_module/images/logos/zendesk.png differ
diff --git a/obsei_module/images/obsei-flyer.png b/obsei_module/images/obsei-flyer.png
new file mode 100644
index 0000000000000000000000000000000000000000..e94f831d50cceada70f07a5e1f6814201f7fa76b
Binary files /dev/null and b/obsei_module/images/obsei-flyer.png differ
diff --git a/obsei_module/images/obsei-ui-demo.png b/obsei_module/images/obsei-ui-demo.png
new file mode 100644
index 0000000000000000000000000000000000000000..c0ce9bc92418c70a4e5b7210f7ab89024820dc17
Binary files /dev/null and b/obsei_module/images/obsei-ui-demo.png differ
diff --git a/obsei_module/images/obsei_flow.gif b/obsei_module/images/obsei_flow.gif
new file mode 100644
index 0000000000000000000000000000000000000000..a6538b34996b6092ce3c978b1daf8ff9ad4da683
--- /dev/null
+++ b/obsei_module/images/obsei_flow.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2bb0b0b15bac52084145aea23f9b47b207853ce9c45d4c355ccadffadc129bb9
+size 6226733
diff --git a/obsei_module/mypy.ini b/obsei_module/mypy.ini
new file mode 100644
index 0000000000000000000000000000000000000000..976ba0294638950e865be3934cbeee3b6305ffd6
--- /dev/null
+++ b/obsei_module/mypy.ini
@@ -0,0 +1,2 @@
+[mypy]
+ignore_missing_imports = True
diff --git a/obsei_module/obsei-master/.github/ISSUE_TEMPLATE/bug_report.md b/obsei_module/obsei-master/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 0000000000000000000000000000000000000000..c6915c4ae905cb402e1dc710b3daafb8f6360df4
--- /dev/null
+++ b/obsei_module/obsei-master/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,27 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: "[BUG]"
+labels: bug
+assignees: lalitpagaria
+
+---
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**To Reproduce**
+Steps to reproduce the behavior:
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Stacktrace**
+If applicable, add stacktrace to help explain your problem.
+
+**Please complete the following information:**
+ - OS:
+ - Version:
+
+**Additional context**
+Add any other context about the problem here.
diff --git a/obsei_module/obsei-master/.github/ISSUE_TEMPLATE/feature_request.md b/obsei_module/obsei-master/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 0000000000000000000000000000000000000000..11fc491ef1dae316f2b06bbb40eaba9c757fdfd1
--- /dev/null
+++ b/obsei_module/obsei-master/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,20 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: ''
+labels: enhancement
+assignees: ''
+
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.
diff --git a/obsei_module/obsei-master/.github/dependabot.yml b/obsei_module/obsei-master/.github/dependabot.yml
new file mode 100644
index 0000000000000000000000000000000000000000..2c7d1708395e202b3b3316391f35bf4183ebd045
--- /dev/null
+++ b/obsei_module/obsei-master/.github/dependabot.yml
@@ -0,0 +1,7 @@
+version: 2
+updates:
+ # Maintain dependencies for GitHub Actions
+ - package-ecosystem: "github-actions"
+ directory: "/"
+ schedule:
+ interval: "daily"
diff --git a/obsei_module/obsei-master/.github/release-drafter.yml b/obsei_module/obsei-master/.github/release-drafter.yml
new file mode 100644
index 0000000000000000000000000000000000000000..794187190e6f3fb290174970df09c18306b58a39
--- /dev/null
+++ b/obsei_module/obsei-master/.github/release-drafter.yml
@@ -0,0 +1,33 @@
+name-template: 'v$RESOLVED_VERSION 🌈'
+tag-template: 'v$RESOLVED_VERSION'
+categories:
+ - title: '🚀 Features'
+ labels:
+ - 'feature'
+ - 'enhancement'
+ - title: '🐛 Bug Fixes'
+ labels:
+ - 'fix'
+ - 'bugfix'
+ - 'bug'
+ - title: '🧰 Maintenance'
+ label: 'chore'
+ - title: '⚠️Breaking Changes'
+ label: 'breaking changes'
+change-template: '- $TITLE @$AUTHOR (#$NUMBER)'
+change-title-escapes: '\<*_&' # You can add # and @ to disable mentions, and add ` to disable code blocks.
+version-resolver:
+ major:
+ labels:
+ - 'major'
+ minor:
+ labels:
+ - 'minor'
+ patch:
+ labels:
+ - 'patch'
+ default: patch
+template: |
+ ## Changes
+
+ $CHANGES
\ No newline at end of file
diff --git a/obsei_module/obsei-master/.github/workflows/build.yml b/obsei_module/obsei-master/.github/workflows/build.yml
new file mode 100644
index 0000000000000000000000000000000000000000..767b04e369bceb740995187c9c3dfda5e3a90325
--- /dev/null
+++ b/obsei_module/obsei-master/.github/workflows/build.yml
@@ -0,0 +1,54 @@
+# This workflow will install Python dependencies, run tests and lint with a single version of Python
+# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: CI
+
+on:
+ push:
+ branches: [ master ]
+ pull_request:
+ branches: [ master ]
+
+jobs:
+ type-check:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ - uses: actions/setup-python@v5
+ with:
+ python-version: '3.10'
+ - name: Test with mypy
+ run: |
+ pip install mypy
+ # Refer http://mypy-lang.blogspot.com/2021/06/mypy-0900-released.html
+ pip install mypy types-requests types-python-dateutil types-PyYAML types-dateparser types-protobuf types-pytz
+ mypy obsei
+
+ build-and-test:
+ needs: type-check
+ runs-on: ${{ matrix.os }}
+ strategy:
+ fail-fast: false
+ matrix:
+ os: [ ubuntu-latest, macos-latest, windows-latest ]
+ python-version: ['3.8', '3.9', '3.10', '3.11']
+
+ steps:
+ - uses: actions/checkout@v4
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v5
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install '.[dev,all]'
+ pip install --upgrade --upgrade-strategy eager trafilatura
+ python -m spacy download en_core_web_lg
+ python -m spacy download en_core_web_sm
+
+ - name: Test with pytest
+ run: |
+ coverage run -m pytest
+ coverage report -m
diff --git a/obsei_module/obsei-master/.github/workflows/pypi_publish.yml b/obsei_module/obsei-master/.github/workflows/pypi_publish.yml
new file mode 100644
index 0000000000000000000000000000000000000000..316334bb75c8e3fff0974b52fa85ffadcbb0b289
--- /dev/null
+++ b/obsei_module/obsei-master/.github/workflows/pypi_publish.yml
@@ -0,0 +1,35 @@
+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+name: Upload Python Package
+
+on:
+ workflow_dispatch:
+ release:
+ types: [published]
+
+jobs:
+ deploy-pypi-artifact:
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: '3.8'
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install setuptools wheel twine hatch
+
+ - name: publish to PyPI
+ if: github.event_name != 'pull_request'
+ env:
+ TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+ TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+ run: |
+ hatch build
+ twine upload dist/*
diff --git a/obsei_module/obsei-master/.github/workflows/release_draft.yml b/obsei_module/obsei-master/.github/workflows/release_draft.yml
new file mode 100644
index 0000000000000000000000000000000000000000..2ed3737754610ea9c71896646975b34355580b4e
--- /dev/null
+++ b/obsei_module/obsei-master/.github/workflows/release_draft.yml
@@ -0,0 +1,15 @@
+name: release draft
+
+on:
+ workflow_dispatch:
+
+jobs:
+ draft-release:
+# if: startsWith(github.ref, 'refs/tags/')
+ runs-on: ubuntu-latest
+ steps:
+ - uses: release-drafter/release-drafter@v6
+ with:
+ config-name: release-drafter.yml
+ env:
+ GITHUB_TOKEN: ${{ secrets.RELEASE_DRAFT_TOKEN }}
\ No newline at end of file
diff --git a/obsei_module/obsei-master/.github/workflows/sdk_docker_publish.yml b/obsei_module/obsei-master/.github/workflows/sdk_docker_publish.yml
new file mode 100644
index 0000000000000000000000000000000000000000..70daa5e698326d30b5d1cee9ba8a5e9213bda1b7
--- /dev/null
+++ b/obsei_module/obsei-master/.github/workflows/sdk_docker_publish.yml
@@ -0,0 +1,50 @@
+# This workflow builds and publishes the Obsei SDK Docker image to Docker Hub
+# For more information see: https://docs.github.com/en/actions/publishing-packages/publishing-docker-images
+
+name: Publish SDK docker image
+
+on:
+ workflow_dispatch:
+ inputs:
+ tag:
+ description: 'Image tag'
+ required: true
+ release:
+ types: [published]
+
+jobs:
+ deploy-sdk-docker:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Docker meta
+ id: docker_meta
+ uses: docker/metadata-action@v5
+ with:
+ images: obsei/obsei-sdk
+
+ - name: Set up QEMU
+ uses: docker/setup-qemu-action@v3
+
+ - name: Set up Docker Buildx
+ uses: docker/setup-buildx-action@v3
+
+ - name: Login to DockerHub
+ if: github.event_name != 'pull_request'
+ uses: docker/login-action@v3
+ with:
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
+ password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and push
+        id: docker_build
+        uses: docker/build-push-action@v5
+        with:
+          context: ./
+          file: ./Dockerfile
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.docker_meta.outputs.tags }}
+          labels: ${{ steps.docker_meta.outputs.labels }}
+
+ - name: Image digest
+ run: echo ${{ steps.docker_build.outputs.digest }}
diff --git a/obsei_module/obsei-master/.github/workflows/ui_docker_publish.yml b/obsei_module/obsei-master/.github/workflows/ui_docker_publish.yml
new file mode 100644
index 0000000000000000000000000000000000000000..2fc690055b8d20c9462412352f2d75f8a6710447
--- /dev/null
+++ b/obsei_module/obsei-master/.github/workflows/ui_docker_publish.yml
@@ -0,0 +1,50 @@
+# This workflow builds and publishes the Obsei UI demo Docker image when a release is
+# published (or on manual dispatch). See: https://docs.docker.com/build/ci/github-actions/
+
+name: Publish UI Docker image
+
+on:
+ workflow_dispatch:
+ inputs:
+ tag:
+ description: 'Image tag'
+ required: true
+ release:
+ types: [published]
+
+jobs:
+ deploy-ui-docker:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Docker meta
+ id: docker_meta
+ uses: docker/metadata-action@v5
+ with:
+ images: obsei/obsei-ui-demo
+
+ - name: Set up QEMU
+ uses: docker/setup-qemu-action@v3
+
+ - name: Set up Docker Buildx
+ uses: docker/setup-buildx-action@v3
+
+ - name: Login to DockerHub
+ if: github.event_name != 'pull_request'
+ uses: docker/login-action@v3
+ with:
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
+ password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and push
+        id: docker_build
+        uses: docker/build-push-action@v5
+        with:
+          context: "{{defaultContext}}:sample-ui"
+          file: Dockerfile
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.docker_meta.outputs.tags }}
+          labels: ${{ steps.docker_meta.outputs.labels }}
+      - name: Image digest
+        run: echo ${{ steps.docker_build.outputs.digest }}
diff --git a/obsei_module/obsei-master/.gitignore b/obsei_module/obsei-master/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..80dd90d9cb4c179a40e922c4a9482c3afe64a999
--- /dev/null
+++ b/obsei_module/obsei-master/.gitignore
@@ -0,0 +1,148 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+/.idea/*
+*.db
+models*
+
+# OSX custom attributes
+.DS_Store
+
+# VS code configuration
+.vscode/*
diff --git a/obsei_module/obsei-master/.pre-commit-config.yaml b/obsei_module/obsei-master/.pre-commit-config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7631ed863745fe51f97e33d1b98b0aeb5ef43b70
--- /dev/null
+++ b/obsei_module/obsei-master/.pre-commit-config.yaml
@@ -0,0 +1,21 @@
+repos:
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v4.3.0
+ hooks:
+ - id: check-yaml
+ - id: trailing-whitespace
+ - id: requirements-txt-fixer
+ - id: end-of-file-fixer
+
+ - repo: https://github.com/psf/black
+ rev: 22.10.0
+ hooks:
+ - id: black
+
+ - repo: https://github.com/pre-commit/mirrors-mypy
+ rev: v0.991
+ hooks:
+ - id: mypy
+ args: [--ignore-missing-imports]
+ additional_dependencies: [types-all]
+ files: ^obsei/
diff --git a/obsei_module/obsei-master/.pyup.yml b/obsei_module/obsei-master/.pyup.yml
new file mode 100644
index 0000000000000000000000000000000000000000..b55ad548d5705a6c15d6f79192892e7612dbc2a3
--- /dev/null
+++ b/obsei_module/obsei-master/.pyup.yml
@@ -0,0 +1,5 @@
+# autogenerated pyup.io config file
+# see https://pyup.io/docs/configuration/ for all available options
+
+schedule: ''
+update: insecure
diff --git a/obsei_module/obsei-master/ATTRIBUTION.md b/obsei_module/obsei-master/ATTRIBUTION.md
new file mode 100644
index 0000000000000000000000000000000000000000..fc6f436d7be74b3ca7d9bbcdcd7d823fb52f7a2e
--- /dev/null
+++ b/obsei_module/obsei-master/ATTRIBUTION.md
@@ -0,0 +1,18 @@
+This could not have been possible without following open source software -
+- [searchtweets-v2](https://github.com/twitterdev/search-tweets-python): For Twitter's API v2 wrapper
+- [vaderSentiment](https://github.com/cjhutto/vaderSentiment): For rule-based sentiment analysis
+- [transformers](https://github.com/huggingface/transformers): For text-classification pipeline
+- [atlassian-python-api](https://github.com/atlassian-api/atlassian-python-api): To interact with Jira
+- [elasticsearch](https://github.com/elastic/elasticsearch-py): To interact with Elasticsearch
+- [pydantic](https://github.com/samuelcolvin/pydantic): For data validation
+- [sqlalchemy](https://github.com/sqlalchemy/sqlalchemy): As SQL toolkit to access DB storage
+- [google-play-scraper](https://github.com/JoMingyu/google-play-scraper): To fetch the Google Play Store review without authentication
+- [praw](https://github.com/praw-dev/praw): For Reddit client
+- [reddit-rss-reader](https://github.com/lalitpagaria/reddit-rss-reader): For Reddit scraping
+- [app-store-reviews-reader](https://github.com/lalitpagaria/app_store_reviews_reader): For App Store reviews scraping
+- [slack-sdk](https://github.com/slackapi/python-slack-sdk): For slack integration
+- [presidio-anonymizer](https://github.com/microsoft/presidio): Personal information anonymizer
+- [GoogleNews](https://github.com/Iceloof/GoogleNews): For Google News integration
+- [python-facebook-api](https://github.com/sns-sdks/python-facebook): For facebook integration
+- [youtube-comment-downloader](https://github.com/egbertbouman/youtube-comment-downloader): For Youtube video comments extraction code
+- [dateparser](https://github.com/scrapinghub/dateparser): To parse date properly (where format is ambiguous)
\ No newline at end of file
diff --git a/obsei_module/obsei-master/CITATION.cff b/obsei_module/obsei-master/CITATION.cff
new file mode 100644
index 0000000000000000000000000000000000000000..bd12a46b8dc05be975f138e2357ebef65de9ade3
--- /dev/null
+++ b/obsei_module/obsei-master/CITATION.cff
@@ -0,0 +1,14 @@
+# YAML 1.2
+---
+authors:
+ -
+ family-names: Pagaria
+ given-names: Lalit
+
+cff-version: "1.1.0"
+license: "Apache-2.0"
+message: "If you use this software, please cite it using this metadata."
+repository-code: "https://github.com/obsei/obsei"
+title: "Obsei - a low code AI powered automation tool"
+version: "0.0.10"
+...
diff --git a/obsei_module/obsei-master/CNAME b/obsei_module/obsei-master/CNAME
new file mode 100644
index 0000000000000000000000000000000000000000..48c4fb7ad825704db946a83e64693071ebe454d7
--- /dev/null
+++ b/obsei_module/obsei-master/CNAME
@@ -0,0 +1 @@
+www.obsei.com
\ No newline at end of file
diff --git a/obsei_module/obsei-master/CODE_OF_CONDUCT.md b/obsei_module/obsei-master/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000000000000000000000000000000000000..e8c5ad02324a0fa0778f625fd77f183f3c531ff7
--- /dev/null
+++ b/obsei_module/obsei-master/CODE_OF_CONDUCT.md
@@ -0,0 +1,128 @@
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, religion, or sexual identity
+and orientation.
+
+We pledge to act and interact in ways that contribute to an open, welcoming,
+diverse, inclusive, and healthy community.
+
+## Our Standards
+
+Examples of behavior that contributes to a positive environment for our
+community include:
+
+- Demonstrating empathy and kindness toward other people
+- Being respectful of differing opinions, viewpoints, and experiences
+- Giving and gracefully accepting constructive feedback
+- Accepting responsibility and apologizing to those affected by our mistakes,
+ and learning from the experience
+- Focusing on what is best not just for us as individuals, but for the
+ overall community
+
+Examples of unacceptable behavior include:
+
+- The use of sexualized language or imagery, and sexual attention or
+ advances of any kind
+- Trolling, insulting or derogatory comments, and personal or political attacks
+- Public or private harassment
+- Publishing others' private information, such as a physical or email
+ address, without their explicit permission
+- Other conduct which could reasonably be considered inappropriate in a
+ professional setting
+
+## Enforcement Responsibilities
+
+Community leaders are responsible for clarifying and enforcing our standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+
+Community leaders have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, and will communicate reasons for moderation
+decisions when appropriate.
+
+## Scope
+
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official e-mail address,
+posting via an official social media account, or acting as an appointed
+representative at an online or offline event.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the community leaders responsible for enforcement at
+obsei.tool@gmail.com
+All complaints will be reviewed and investigated promptly and fairly.
+
+All community leaders are obligated to respect the privacy and security of the
+reporter of any incident.
+
+## Enforcement Guidelines
+
+Community leaders will follow these Community Impact Guidelines in determining
+the consequences for any action they deem in violation of this Code of Conduct:
+
+### 1. Correction
+
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+
+**Consequence**: A written warning from community leaders, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+
+### 2. Warning
+
+**Community Impact**: A violation through a single incident or series
+of actions.
+
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction with
+those enforcing the Code of Conduct, for a specified period of time. This
+includes avoiding interactions in community spaces as well as external channels
+like social media. Violating these terms may lead to a temporary or
+permanent ban.
+
+### 3. Temporary Ban
+
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public or
+private interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, is allowed during this period.
+Violating these terms may lead to a permanent ban.
+
+### 4. Permanent Ban
+
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior, harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+
+**Consequence**: A permanent ban from any sort of public interaction within
+the community.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 2.0, available at
+https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
+
+Community Impact Guidelines were inspired by [Mozilla's code of conduct
+enforcement ladder](https://github.com/mozilla/diversity).
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see the FAQ at
+https://www.contributor-covenant.org/faq. Translations are available at
+https://www.contributor-covenant.org/translations.
diff --git a/obsei_module/obsei-master/CONTRIBUTING.md b/obsei_module/obsei-master/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..f0afbb0a9ac896f8afb37430e389450efe498926
--- /dev/null
+++ b/obsei_module/obsei-master/CONTRIBUTING.md
@@ -0,0 +1,103 @@
+# 👐 Contributing to Obsei
+
+First off, thank you for even considering contributing to this package, every contribution big or small is greatly appreciated.
+Community contributions are what keep projects like this fueled and constantly improving, so a big thanks to you!
+
+Below are some sections detailing the guidelines we'd like you to follow to make your contribution as seamless as possible.
+
+- [Code of Conduct](#coc)
+- [Asking a Question and Discussions](#question)
+- [Issues, Bugs, and Feature Requests](#issue)
+- [Submission Guidelines](#submit)
+- [Code Style and Formatting](#code)
+- [Contributor License Agreement](#cla)
+
+## 📜 Code of Conduct
+
+The [Code of Conduct](https://github.com/obsei/obsei/blob/master/CODE_OF_CONDUCT.md) applies within all community spaces.
+If you are not familiar with our Code of Conduct policy, take a minute to read the policy before starting with your first contribution.
+
+## 🗣️ Query or Discussion
+
+We would like to use [Github discussions](https://github.com/obsei/obsei/discussions) as the central hub for all
+community discussions, questions, and everything else in between. While Github discussions is a new service (as of 2021)
+we believe that it really helps keep this repo as one single source to find all relevant information. Our hope is that
+discussion page functions as a record of all the conversations that help contribute to the project's development.
+
+If you are new to [Github discussions](https://github.com/obsei/obsei/discussions) it is a very similar experience
+to Stack Overflow with an added element of general discussion and discourse rather than solely being question and answer based.
+
+## 🪲 Issues, Bugs, and Feature Requests
+
+We are very open to community contributions and appreciate anything that improves **Obsei**. This includes fixing typos, adding missing documentation, fixing bugs or adding new features.
+To avoid unnecessary work on either side, please stick to the following process:
+
+1. If you feel like your issue is not specific and more of a general question about a design decision, or algorithm implementation maybe start a [discussion](https://github.com/obsei/obsei/discussions) instead, this helps keep the issues less cluttered and encourages more open-ended conversation.
+2. Check if there is already [a related issue](https://github.com/obsei/obsei/issues).
+3. If there is not, open a new one to start a discussion. Some features might be a nice idea, but don't fit in the scope of Obsei and we hate to close finished PRs.
+4. If we came to the conclusion to move forward with your issue, we will be happy to accept a pull request. Make sure you create a pull request in an early draft version and ask for feedback.
+5. Verify that all tests in the CI pass (and add new ones if you implement anything new)
+
+See [below](#submit) for some guidelines.
+
+## ✉️ Submission Guidelines
+
+### Submitting an Issue
+
+Before you submit your issue search the archive, maybe your question was already answered.
+
+If your issue appears to be a bug, and hasn't been reported, open a new issue.
+Help us to maximize the effort we can spend fixing issues and adding new
+features, by not reporting duplicate issues. Providing the following information will increase the
+chances of your issue being dealt with quickly:
+
+- **Describe the bug** - A clear and concise description of what the bug is.
+- **To Reproduce**- Steps to reproduce the behavior.
+- **Expected behavior** - A clear and concise description of what you expected to happen.
+- **Environment**
+ - Obsei version
+ - Python version
+ - OS
+- **Suggest a Fix** - if you can't fix the bug yourself, perhaps you can point to what might be
+ causing the problem (line of code or commit)
+
+When you submit a PR you will be presented with a PR template, please fill this in as best you can.
+
+### Submitting a Pull Request
+
+Before you submit your pull request consider the following guidelines:
+
+- Search [GitHub](https://github.com/obsei/obsei/pulls) for an open or closed Pull Request
+ that relates to your submission. You don't want to duplicate effort.
+- Fork the main repo if not already done
+- Rebase fork with `upstream master`
+- Create new branch and add the changes in that branch
+- Add supporting test cases
+- Follow our [Coding Rules](#rules).
+- Avoid checking in files that shouldn't be tracked (e.g `dist`, `build`, `.tmp`, `.idea`).
+ We recommend using a [global](#global-gitignore) gitignore for this.
+- Before you commit please run the test suite and make sure all tests are passing.
+- Format your code appropriately:
+ - This package uses [black](https://black.readthedocs.io/en/stable/) as its formatter.
+ In order to format your code with black run `black . ` from the root of the package.
+- Run `pre-commit run --all-files` if you're adding new hooks to pre-commit config file. By default, pre-commit will run on modified files when committing changes.
+- Commit your changes using a descriptive commit message.
+- In GitHub, send a pull request to `obsei:master`.
+- If we suggest changes then:
+ - Make the required updates.
+ - Rebase your branch and force push to your GitHub repository (this will update your Pull Request):
+
+That's it! Thank you for your contribution!
+
+## ✅ Coding Rules
+
+We generally follow the [Google Python style guide](http://google.github.io/styleguide/pyguide.html).
+
+## 📝 Contributor License Agreement
+
+To ensure we do not run into any legal problems later, it is sadly necessary to sign a [Contributor License Agreement](CONTRIBUTOR_LICENSE_AGREEMENT.md). That can be done literally with the push of a button.
+
+---
+
+_This guide was inspired by the [transformers-interpret](https://github.com/cdpierse/transformers-interpret/blob/master/CONTRIBUTING.md),
+[Haystack](https://github.com/deepset-ai/haystack/blob/master/CONTRIBUTING.md) and [n8n](https://github.com/n8n-io/n8n/blob/master/CONTRIBUTOR_LICENSE_AGREEMENT.md)_
diff --git a/obsei_module/obsei-master/CONTRIBUTOR_LICENSE_AGREEMENT.md b/obsei_module/obsei-master/CONTRIBUTOR_LICENSE_AGREEMENT.md
new file mode 100644
index 0000000000000000000000000000000000000000..8b4784f57938ed30cbc0de319b9c90df121b3632
--- /dev/null
+++ b/obsei_module/obsei-master/CONTRIBUTOR_LICENSE_AGREEMENT.md
@@ -0,0 +1,3 @@
+# Obsei Contributor License Agreement
+
+I give Obsei's Creator permission to license my contributions to any terms they like. I am giving them this license in order to make it possible for them to accept my contributions into their project.
\ No newline at end of file
diff --git a/obsei_module/obsei-master/Dockerfile b/obsei_module/obsei-master/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..b16cbfd7580a3c384e93b690c80a8e4812d1a57f
--- /dev/null
+++ b/obsei_module/obsei-master/Dockerfile
@@ -0,0 +1,38 @@
+# This is Docker file to Obsei SDK with dependencies installed
+FROM python:3.10-slim-bullseye
+
+RUN useradd --create-home user
+WORKDIR /home/user
+
+# env variable
+ENV PIP_DISABLE_PIP_VERSION_CHECK 1
+ENV PIP_NO_CACHE_DIR 1
+ENV WORKFLOW_SCRIPT '/home/user/obsei/process_workflow.py'
+ENV OBSEI_CONFIG_PATH ""
+ENV OBSEI_CONFIG_FILENAME ""
+
+
+# Hack to install jre on debian
+RUN mkdir -p /usr/share/man/man1
+
+# install few required tools
+RUN apt-get update && apt-get install -y --no-install-recommends curl git pkg-config cmake libncurses5 g++ \
+ && apt-get clean autoclean && apt-get autoremove -y \
+ && rm -rf /var/lib/{apt,dpkg,cache,log}/
+
+# install as a package
+COPY pyproject.toml README.md /home/user/
+RUN pip install --upgrade pip
+
+# copy README
+COPY README.md /home/user/
+
+# copy code
+COPY obsei /home/user/obsei
+RUN pip install -e .[all]
+
+
+USER user
+
+# cmd for running the API
+CMD ["sh", "-c", "python ${WORKFLOW_SCRIPT}"]
diff --git a/obsei_module/obsei-master/LICENSE b/obsei_module/obsei-master/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..fecb6d71f505d183b3e4f5bbda806637c660d0f1
--- /dev/null
+++ b/obsei_module/obsei-master/LICENSE
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2020-2022 Oraika Technologies Private Limited (https://www.oraika.com)
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/obsei_module/obsei-master/MANIFEST.in b/obsei_module/obsei-master/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..84c71247ce333d3b19e1265f4da3fd130972bc35
--- /dev/null
+++ b/obsei_module/obsei-master/MANIFEST.in
@@ -0,0 +1,3 @@
+include LICENSE
+include requirements.txt
+include README.md
diff --git a/obsei_module/obsei-master/README.md b/obsei_module/obsei-master/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..48c602f538183d2bba5f019e2c81cb32946cb71b
--- /dev/null
+++ b/obsei_module/obsei-master/README.md
@@ -0,0 +1,1067 @@
+
+
+---
+
+![](https://raw.githubusercontent.com/obsei/obsei-resources/master/gifs/obsei_flow.gif)
+
+---
+
+
+Note: Obsei is still in alpha stage, so use it carefully in production. Also, as it is constantly under development, the master branch may contain many breaking changes. Please use a released version.
+
+
+---
+
+**Obsei** (pronounced "Ob see" | /əb-'sē/) is an open-source, low-code, AI powered automation tool. _Obsei_ consists of -
+
+- **Observer**: Collect unstructured data from various sources like tweets from Twitter, Subreddit comments on Reddit, page post's comments from Facebook, App Stores reviews, Google reviews, Amazon reviews, News, Website, etc.
+- **Analyzer**: Analyze unstructured data collected with various AI tasks like classification, sentiment analysis, translation, PII, etc.
+- **Informer**: Send analyzed data to various destinations like ticketing platforms, data storage, dataframe, etc so that the user can take further actions and perform analysis on the data.
+
+All the Observers can store their state in databases (Sqlite, Postgres, MySQL, etc.), making Obsei suitable for scheduled jobs or serverless applications.
+
+![Obsei diagram](https://raw.githubusercontent.com/obsei/obsei-resources/master/images/Obsei_diagram.png)
+
+### Future direction -
+
+- Text, Image, Audio, Documents and Video oriented workflows
+- Collect data from every possible private and public channels
+- Add every possible workflow to an AI downstream application to automate manual cognitive workflows
+
+## Use cases
+
+_Obsei_ use cases are following, but not limited to -
+
+- Social listening: Listening about social media posts, comments, customer feedback, etc.
+- Alerting/Notification: To get auto-alerts for events such as customer complaints, qualified sales leads, etc.
+- Automatic customer issue creation based on customer complaints on Social Media, Email, etc.
+- Automatic assignment of proper tags to tickets based on the content of the customer complaint, for example login issue, sign up issue, delivery issue, etc.
+- Extraction of deeper insight from feedbacks on various platforms
+- Market research
+- Creation of dataset for various AI tasks
+- Many more based on creativity 💡
+
+## Installation
+
+### Prerequisite
+
+Install the following (if not present already) -
+
+- Install [Python 3.7+](https://www.python.org/downloads/)
+- Install [PIP](https://pip.pypa.io/en/stable/installing/)
+
+### Install Obsei
+
+You can install Obsei either via PIP or Conda based on your preference.
+To install latest released version -
+
+```shell
+pip install obsei[all]
+```
+
+Install from master branch (if you want to try the latest features) -
+
+```shell
+git clone https://github.com/obsei/obsei.git
+cd obsei
+pip install --editable .[all]
+```
+
+Note: `all` option will install all the dependencies which might not be needed for your workflow, alternatively
+following options are available to install minimal dependencies as per need -
+ - `pip install obsei[source]`: To install dependencies related to all observers
+ - `pip install obsei[sink]`: To install dependencies related to all informers
+ - `pip install obsei[analyzer]`: To install dependencies related to all analyzers, it will install pytorch as well
+ - `pip install obsei[twitter-api]`: To install dependencies related to Twitter observer
+ - `pip install obsei[google-play-scraper]`: To install dependencies related to Play Store review scrapper observer
+ - `pip install obsei[google-play-api]`: To install dependencies related to Google official play store review API based observer
+ - `pip install obsei[app-store-scraper]`: To install dependencies related to Apple App Store review scrapper observer
+ - `pip install obsei[reddit-scraper]`: To install dependencies related to Reddit post and comment scrapper observer
+ - `pip install obsei[reddit-api]`: To install dependencies related to Reddit official api based observer
+ - `pip install obsei[pandas]`: To install dependencies related to TSV/CSV/Pandas based observer and informer
+ - `pip install obsei[google-news-scraper]`: To install dependencies related to Google news scrapper observer
+ - `pip install obsei[facebook-api]`: To install dependencies related to Facebook official page post and comments api based observer
+ - `pip install obsei[atlassian-api]`: To install dependencies related to Jira official api based informer
+ - `pip install obsei[elasticsearch]`: To install dependencies related to elasticsearch informer
+ - `pip install obsei[slack-api]`: To install dependencies related to Slack official api based informer
+
+You can also mix multiple dependencies together in single installation command. For example to install dependencies
+Twitter observer, all analyzer, and Slack informer use following command -
+```shell
+pip install obsei[twitter-api, analyzer, slack-api]
+```
+
+
+## How to use
+
+Expand the following steps and create a workflow -
+
+Step 1: Configure Source/Observer
+
+
+
Twitter
+
+```python
+from obsei.source.twitter_source import TwitterCredentials, TwitterSource, TwitterSourceConfig
+
+# initialize twitter source config
+source_config = TwitterSourceConfig(
+ keywords=["issue"], # Keywords, @user or #hashtags
+ lookup_period="1h", # Lookup period from current time, format: `` (day|hour|minute)
+ cred_info=TwitterCredentials(
+ # Enter your twitter consumer key and secret. Get it from https://developer.twitter.com/en/apply-for-access
+ consumer_key="",
+ consumer_secret="",
+ bearer_token='',
+ )
+)
+
+# initialize tweets retriever
+source = TwitterSource()
+```
+
+
+
+
+
+
Youtube Scrapper
+
+```python
+from obsei.source.youtube_scrapper import YoutubeScrapperSource, YoutubeScrapperConfig
+
+# initialize Youtube source config
+source_config = YoutubeScrapperConfig(
+ video_url="https://www.youtube.com/watch?v=uZfns0JIlFk", # Youtube video URL
+ fetch_replies=True, # Fetch replies to comments
+ max_comments=10, # Total number of comments and replies to fetch
+ lookup_period="1Y", # Lookup period from current time, format: `` (day|hour|minute|month|year)
+)
+
+# initialize Youtube comments retriever
+source = YoutubeScrapperSource()
+```
+
+
+
+
+
+
Facebook
+
+```python
+from obsei.source.facebook_source import FacebookCredentials, FacebookSource, FacebookSourceConfig
+
+# initialize facebook source config
+source_config = FacebookSourceConfig(
+ page_id="110844591144719", # Facebook page id, for example this one for Obsei
+ lookup_period="1h", # Lookup period from current time, format: `` (day|hour|minute)
+ cred_info=FacebookCredentials(
+ # Enter your facebook app_id, app_secret and long_term_token. Get it from https://developers.facebook.com/apps/
+ app_id="",
+ app_secret="",
+ long_term_token="",
+ )
+)
+
+# initialize facebook post comments retriever
+source = FacebookSource()
+```
+
+
+
+
+
+
Email
+
+```python
+from obsei.source.email_source import EmailConfig, EmailCredInfo, EmailSource
+
+# initialize email source config
+source_config = EmailConfig(
+ # List of IMAP servers for most commonly used email providers
+ # https://www.systoolsgroup.com/imap/
+ # Also, if you're using a Gmail account then make sure you allow less secure apps on your account -
+ # https://myaccount.google.com/lesssecureapps?pli=1
+ # Also enable IMAP access -
+ # https://mail.google.com/mail/u/0/#settings/fwdandpop
+ imap_server="imap.gmail.com", # Enter IMAP server
+ cred_info=EmailCredInfo(
+ # Enter your email account username and password
+ username="",
+ password=""
+ ),
+ lookup_period="1h" # Lookup period from current time, format: `` (day|hour|minute)
+)
+
+# initialize email retriever
+source = EmailSource()
+```
+
+
+
+
+
+
Google Maps Reviews Scrapper
+
+```python
+from obsei.source.google_maps_reviews import OSGoogleMapsReviewsSource, OSGoogleMapsReviewsConfig
+
+# initialize Outscrapper Maps review source config
+source_config = OSGoogleMapsReviewsConfig(
+ # Collect API key from https://outscraper.com/
+ api_key="",
+ # Enter Google Maps link or place id
+ # For example below is for the "Taj Mahal"
+ queries=["https://www.google.co.in/maps/place/Taj+Mahal/@27.1751496,78.0399535,17z/data=!4m5!3m4!1s0x39747121d702ff6d:0xdd2ae4803f767dde!8m2!3d27.1751448!4d78.0421422"],
+ number_of_reviews=10,
+)
+
+
+# initialize Outscrapper Maps review retriever
+source = OSGoogleMapsReviewsSource()
+```
+
+
+
+
+
+
AppStore Reviews Scrapper
+
+```python
+from obsei.source.appstore_scrapper import AppStoreScrapperConfig, AppStoreScrapperSource
+
+# initialize app store source config
+source_config = AppStoreScrapperConfig(
+ # Need two parameters app_id and country.
+ # `app_id` can be found at the end of the url of app in app store.
+ # For example - https://apps.apple.com/us/app/xcode/id497799835
+ # `310633997` is the app_id for xcode and `us` is country.
+ countries=["us"],
+ app_id="310633997",
+ lookup_period="1h" # Lookup period from current time, format: `` (day|hour|minute)
+)
+
+
+# initialize app store reviews retriever
+source = AppStoreScrapperSource()
+```
+
+
+
+
+
+
Play Store Reviews Scrapper
+
+```python
+from obsei.source.playstore_scrapper import PlayStoreScrapperConfig, PlayStoreScrapperSource
+
+# initialize play store source config
+source_config = PlayStoreScrapperConfig(
+ # Need two parameters package_name and country.
+ # `package_name` can be found at the end of the url of app in play store.
+ # For example - https://play.google.com/store/apps/details?id=com.google.android.gm&hl=en&gl=US
+ # `com.google.android.gm` is the package_name for Gmail and `us` is country.
+ countries=["us"],
+ package_name="com.google.android.gm",
+ lookup_period="1h" # Lookup period from current time, format: `` (day|hour|minute)
+)
+
+# initialize play store reviews retriever
+source = PlayStoreScrapperSource()
+```
+
+
+
+
+
+
Reddit
+
+```python
+from obsei.source.reddit_source import RedditConfig, RedditSource, RedditCredInfo
+
+# initialize reddit source config
+source_config = RedditConfig(
+ subreddits=["wallstreetbets"], # List of subreddits
+ # Reddit account username and password
+ # You can also enter reddit client_id and client_secret or refresh_token
+ # Create credential at https://www.reddit.com/prefs/apps
+ # Also refer https://praw.readthedocs.io/en/latest/getting_started/authentication.html
+ # Currently Password Flow, Read Only Mode and Saved Refresh Token Mode are supported
+ cred_info=RedditCredInfo(
+ username="",
+ password=""
+ ),
+ lookup_period="1h" # Lookup period from current time, format: `` (day|hour|minute)
+)
+
+# initialize reddit retriever
+source = RedditSource()
+```
+
+
+
+
+
+
Reddit Scrapper
+
+Note: Reddit heavily rate limits scrapers, hence use it to fetch a small amount of data over a long period
+
+```python
+from obsei.source.reddit_scrapper import RedditScrapperConfig, RedditScrapperSource
+
+# initialize reddit scrapper source config
+source_config = RedditScrapperConfig(
+ # Reddit subreddit, search etc rss url. For proper url refer following link -
+ # Refer https://www.reddit.com/r/pathogendavid/comments/tv8m9/pathogendavids_guide_to_rss_and_reddit/
+ url="https://www.reddit.com/r/wallstreetbets/comments/.rss?sort=new",
+ lookup_period="1h" # Lookup period from current time, format: `` (day|hour|minute)
+)
+
+# initialize reddit retriever
+source = RedditScrapperSource()
+```
+
+
+
+
+
+
Google News
+
+```python
+from obsei.source.google_news_source import GoogleNewsConfig, GoogleNewsSource
+
+# initialize Google News source config
+source_config = GoogleNewsConfig(
+ query='bitcoin',
+ max_results=5,
+ # To fetch full article text enable `fetch_article` flag
+ # By default google news gives title and highlight
+ fetch_article=True,
+ # proxy='http://127.0.0.1:8080'
+)
+
+# initialize Google News retriever
+source = GoogleNewsSource()
+```
+
+
+
Pandas DataFrame
+
+```python
+import pandas as pd
+from obsei.source.pandas_source import PandasSource, PandasSourceConfig
+
+# Initialize your Pandas DataFrame from your sources like csv, excel, sql etc
+# In following example we are reading csv which have two columns title and text
+csv_file = "https://raw.githubusercontent.com/deepset-ai/haystack/master/tutorials/small_generator_dataset.csv"
+dataframe = pd.read_csv(csv_file)
+
+# initialize pandas sink config
+sink_config = PandasSourceConfig(
+ dataframe=dataframe,
+ include_columns=["score"],
+ text_columns=["name", "degree"],
+)
+
+# initialize pandas sink
+sink = PandasSource()
+```
+
+
+
+
+
+
+
+
+
+Step 2: Configure Analyzer
+
+Note: To run transformers in an offline mode, check [transformers offline mode](https://huggingface.co/transformers/installation.html#offline-mode).
+
+
Some analyzer support GPU and to utilize pass device parameter.
+List of possible values of device parameter (default value auto):
+
+
auto: GPU (cuda:0) will be used if available otherwise CPU will be used
+
cpu: CPU will be used
+
cuda:{id} - GPU will be used with provided CUDA device id
+
+
+
+
+
Text Classification
+
+Text classification: Classify text into user provided categories.
+
+```python
+from obsei.analyzer.classification_analyzer import ClassificationAnalyzerConfig, ZeroShotClassificationAnalyzer
+
+# initialize classification analyzer config
+# It can also detect sentiments if "positive" and "negative" labels are added.
+analyzer_config=ClassificationAnalyzerConfig(
+ labels=["service", "delay", "performance"],
+)
+
+# initialize classification analyzer
+# For supported models refer https://huggingface.co/models?filter=zero-shot-classification
+text_analyzer = ZeroShotClassificationAnalyzer(
+ model_name_or_path="typeform/mobilebert-uncased-mnli",
+ device="auto"
+)
+```
+
+
+
+
+
+
Sentiment Analyzer
+
+Sentiment Analyzer: Detect the sentiment of the text. Text classification can also perform sentiment analysis but if you don't want to use heavy-duty NLP model then use less resource hungry dictionary based Vader Sentiment detector.
+
+```python
+from obsei.analyzer.sentiment_analyzer import VaderSentimentAnalyzer
+
+# Vader does not need any configuration settings
+analyzer_config=None
+
+# initialize vader sentiment analyzer
+text_analyzer = VaderSentimentAnalyzer()
+```
+
+
+
+
+
+
NER Analyzer
+
+NER (Named-Entity Recognition) Analyzer: Extract information and classify named entities mentioned in text into pre-defined categories such as person names, organizations, locations, medical codes, time expressions, quantities, monetary values, percentages, etc
+
+```python
+from obsei.analyzer.ner_analyzer import NERAnalyzer
+
+# NER analyzer does not need configuration settings
+analyzer_config=None
+
+# initialize ner analyzer
+# For supported models refer https://huggingface.co/models?filter=token-classification
+text_analyzer = NERAnalyzer(
+ model_name_or_path="elastic/distilbert-base-cased-finetuned-conll03-english",
+ device = "auto"
+)
+```
+
+
+
PII Anonymizer
+
+```python
+from obsei.analyzer.pii_analyzer import PresidioEngineConfig, PresidioModelConfig, \
+ PresidioPIIAnalyzer, PresidioPIIAnalyzerConfig
+
+# initialize pii analyzer's config
+analyzer_config = PresidioPIIAnalyzerConfig(
+ # Whether to return only pii analysis or anonymize text
+ analyze_only=False,
+ # Whether to return detail information about anonymization decision
+ return_decision_process=True
+)
+
+# initialize pii analyzer
+analyzer = PresidioPIIAnalyzer(
+ engine_config=PresidioEngineConfig(
+ # spacy and stanza nlp engines are supported
+ # For more info refer
+ # https://microsoft.github.io/presidio/analyzer/developing_recognizers/#utilize-spacy-or-stanza
+ nlp_engine_name="spacy",
+ # Update desired spacy model and language
+ models=[PresidioModelConfig(model_name="en_core_web_lg", lang_code="en")]
+ )
+)
+```
+
+
+
+
+
+
Dummy Analyzer
+
+Dummy Analyzer: Does nothing. Its simply used for transforming the input (TextPayload) to output (TextPayload) and adding the user supplied dummy data.
+
+```python
+from obsei.analyzer.dummy_analyzer import DummyAnalyzer, DummyAnalyzerConfig
+
+# initialize dummy analyzer's configuration settings
+analyzer_config = DummyAnalyzerConfig()
+
+# initialize dummy analyzer
+analyzer = DummyAnalyzer()
+```
+
+
+
+
+
+
+
+
+
+Step 3: Configure Sink/Informer
+
+
+
Slack
+
+```python
+from obsei.sink.slack_sink import SlackSink, SlackSinkConfig
+
+# initialize slack sink config
+sink_config = SlackSinkConfig(
+ # Provide slack bot/app token
+ # For more detail refer https://slack.com/intl/en-de/help/articles/215770388-Create-and-regenerate-API-tokens
+ slack_token="",
+ # To get channel id refer https://stackoverflow.com/questions/40940327/what-is-the-simplest-way-to-find-a-slack-team-id-and-a-channel-id
+ channel_id="C01LRS6CT9Q"
+)
+
+# initialize slack sink
+sink = SlackSink()
+```
+
+
+
+
+
+
Zendesk
+
+```python
+from obsei.sink.zendesk_sink import ZendeskSink, ZendeskSinkConfig, ZendeskCredInfo
+
+# initialize zendesk sink config
+sink_config = ZendeskSinkConfig(
+ # provide zendesk domain
+ domain="zendesk.com",
+ # provide subdomain if you have one
+ subdomain=None,
+ # Enter zendesk user details
+ cred_info=ZendeskCredInfo(
+ email="",
+ password=""
+ )
+)
+
+# initialize zendesk sink
+sink = ZendeskSink()
+```
+
+
+
+
+
+
Jira
+
+```python
+from obsei.sink.jira_sink import JiraSink, JiraSinkConfig
+
+# For testing purpose you can start jira server locally
+# Refer https://developer.atlassian.com/server/framework/atlassian-sdk/atlas-run-standalone/
+
+# initialize Jira sink config
+sink_config = JiraSinkConfig(
+ url="http://localhost:2990/jira", # Jira server url
+ # Jira username & password for user who have permission to create issue
+ username="",
+ password="",
+ # Which type of issue to be created
+ # For more information refer https://support.atlassian.com/jira-cloud-administration/docs/what-are-issue-types/
+ issue_type={"name": "Task"},
+ # Under which project issue to be created
+ # For more information refer https://support.atlassian.com/jira-software-cloud/docs/what-is-a-jira-software-project/
+ project={"key": "CUS"},
+)
+
+# initialize Jira sink
+sink = JiraSink()
+```
+
+
+
+
+
+
ElasticSearch
+
+```python
+from obsei.sink.elasticsearch_sink import ElasticSearchSink, ElasticSearchSinkConfig
+
+# For testing purpose you can start Elasticsearch server locally via docker
+# `docker run -d --name elasticsearch -p 9200:9200 -e "discovery.type=single-node" elasticsearch:8.5.0`
+
+# initialize Elasticsearch sink config
+sink_config = ElasticSearchSinkConfig(
+ # Elasticsearch server
+ hosts="http://localhost:9200",
+ # Index name, it will create if not exist
+ index_name="test",
+)
+
+# initialize Elasticsearch sink
+sink = ElasticSearchSink()
+```
+
+
+
+
+
+
Http
+
+```python
+from obsei.sink.http_sink import HttpSink, HttpSinkConfig
+
+# For testing purpose you can create mock http server via postman
+# For more details refer https://learning.postman.com/docs/designing-and-developing-your-api/mocking-data/setting-up-mock/
+
+# initialize http sink config (Currently only POST call is supported)
+sink_config = HttpSinkConfig(
+ # provide http server url
+ url="https://localhost:8080/api/path",
+ # Here you can add headers you would like to pass with request
+ headers={
+ "Content-type": "application/json"
+ }
+)
+
+# To modify or converting the payload, create convertor class
+# Refer obsei.sink.dailyget_sink.PayloadConvertor for example
+
+# initialize http sink
+sink = HttpSink()
+```
+
+
+
+
+
+
+Step 4: Join and create workflow
+
+`source` will fetch data from the selected source, then feed it to the `analyzer` for processing, whose output we feed into a `sink` to get notified at that sink.
+
+```python
+# Uncomment if you want logger
+# import logging
+# import sys
+# logger = logging.getLogger(__name__)
+# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+# This will fetch information from configured source ie twitter, app store etc
+source_response_list = source.lookup(source_config)
+
+# Uncomment if you want to log source response
+# for idx, source_response in enumerate(source_response_list):
+# logger.info(f"source_response#'{idx}'='{source_response.__dict__}'")
+
+# This will execute analyzer (Sentiment, classification etc) on source data with provided analyzer_config
+analyzer_response_list = text_analyzer.analyze_input(
+ source_response_list=source_response_list,
+ analyzer_config=analyzer_config
+)
+
+# Uncomment if you want to log analyzer response
+# for idx, an_response in enumerate(analyzer_response_list):
+# logger.info(f"analyzer_response#'{idx}'='{an_response.__dict__}'")
+
+# Analyzer output added to segmented_data
+# Uncomment to log it
+# for idx, an_response in enumerate(analyzer_response_list):
+# logger.info(f"analyzed_data#'{idx}'='{an_response.segmented_data.__dict__}'")
+
+# This will send analyzed output to configure sink ie Slack, Zendesk etc
+sink_response_list = sink.send_data(analyzer_response_list, sink_config)
+
+# Uncomment if you want to log sink response
+# for sink_response in sink_response_list:
+# if sink_response is not None:
+# logger.info(f"sink_response='{sink_response}'")
+```
+
+
+
+Step 5: Execute workflow
+Copy the code snippets from Steps 1 to 4 into a python file, for example example.py and execute the following command -
+
+```shell
+python example.py
+```
+
+
+
+## Demo
+
+We have a minimal [streamlit](https://streamlit.io/) based UI that you can use to test Obsei.
+
+![Screenshot](https://raw.githubusercontent.com/obsei/obsei-resources/master/images/obsei-ui-demo.png)
+
+### Watch UI demo video
+
+[![Introductory and demo video](https://img.youtube.com/vi/GTF-Hy96gvY/2.jpg)](https://www.youtube.com/watch?v=GTF-Hy96gvY)
+
+Check demo at [![](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/obsei/obsei-demo)
+
+(**Note**: Sometimes the Streamlit demo might not work due to rate limiting, use the docker image (locally) in such cases.)
+
+To test locally, just run
+
+```
+docker run -d --name obsei-ui -p 8501:8501 obsei/obsei-ui-demo
+
+# You can find the UI at http://localhost:8501
+```
+
+**To run Obsei workflow easily using GitHub Actions (no sign ups and cloud hosting required), refer to this [repo](https://github.com/obsei/demo-workflow-action)**.
+
+## Companies/Projects using Obsei
+
+Here are some companies/projects (alphabetical order) using Obsei. To add your company/project to the list, please raise a PR or contact us via [email](mailto:contact@obsei.com).
+
+- [Oraika](https://www.oraika.com): Contextually understand customer feedback
+- [1Page](https://www.get1page.com/): Giving a better context in meetings and calls
+- [Spacepulse](http://spacepulse.in/): The operating system for spaces
+- [Superblog](https://superblog.ai/): A blazing fast alternative to WordPress and Medium
+- [Zolve](https://zolve.com/): Creating a financial world beyond borders
+- [Utilize](https://www.utilize.app/): No-code app builder for businesses with a deskless workforce
+
+## Articles
+
+
Observe app reviews from Google play store, PreProcess text via various text cleaning functions, Analyze them by performing text classification, Inform them to Pandas DataFrame and store resultant CSV to Google Drive
+
+
+
PlayStore Reviews → PreProcessing → Classification → Pandas DataFrame → CSV in Google Drive
Observe app reviews from Apple app store, PreProcess text via various text cleaning function, Analyze them by performing text classification, Inform them to Pandas DataFrame and store resultant CSV to Google Drive
+
+
+
AppStore Reviews → PreProcessing → Classification → Pandas DataFrame → CSV in Google Drive
Observe news article from Google news, PreProcess text via various text cleaning function, Analyze them via performing text classification while splitting text in small chunks and later computing final inference using given formula
+
+
+
Google News → Text Cleaner → Text Splitter → Classification → Inference Aggregator
+
+💡Tips: Handle large text classification via Obsei
+
+![](https://raw.githubusercontent.com/obsei/obsei-resources/master/gifs/Long_Text_Classification.gif)
+
+
+
+## Documentation
+
+For detailed installation instructions, usages and examples, refer to our [documentation](https://obsei.github.io/obsei/).
+
+## Support and Release Matrix
+
+
+
+
+
+
Linux
+
Mac
+
Windows
+
Remark
+
+
+
+
+
Tests
+
✅
+
✅
+
✅
+
Low Coverage as difficult to test 3rd party libs
+
+
+
PIP
+
✅
+
✅
+
✅
+
Fully Supported
+
+
+
Conda
+
❌
+
❌
+
❌
+
Not Supported
+
+
+
+
+## Discussion forum
+
+Discussion about _Obsei_ can be done at [community forum](https://github.com/obsei/obsei/discussions)
+
+## Changelogs
+
+Refer [releases](https://github.com/obsei/obsei/releases) for changelogs
+
+## Security Issue
+
+For any security issue please contact us via [email](mailto:contact@oraika.com)
+
+## Stargazers over time
+
+[![Stargazers over time](https://starchart.cc/obsei/obsei.svg)](https://starchart.cc/obsei/obsei)
+
+## Maintainers
+
+This project is being maintained by [Oraika Technologies](https://www.oraika.com). [Lalit Pagaria](https://github.com/lalitpagaria) and [Girish Patel](https://github.com/GirishPatel) are maintainers of this project.
+
+## License
+
+- Copyright holder: [Oraika Technologies](https://www.oraika.com)
+- Overall Apache 2.0 and you can read [License](https://github.com/obsei/obsei/blob/master/LICENSE) file.
+- Multiple other secondary permissive or weak copyleft licenses (LGPL, MIT, BSD etc.) for third-party components refer [Attribution](https://github.com/obsei/obsei/blob/master/ATTRIBUTION.md).
+- To make the project more commercial friendly, we avoid third party components which have strong copyleft licenses (GPL, AGPL etc.) in the project.
+
+## Attribution
+
+This could not have been possible without these [open source softwares](https://github.com/obsei/obsei/blob/master/ATTRIBUTION.md).
+
+## Contribution
+
+First off, thank you for even considering contributing to this package, every contribution big or small is greatly appreciated.
+Please refer our [Contribution Guideline](https://github.com/obsei/obsei/blob/master/CONTRIBUTING.md) and [Code of Conduct](https://github.com/obsei/obsei/blob/master/CODE_OF_CONDUCT.md).
+
+Thanks so much to all our contributors
+
+
+
+
diff --git a/obsei_module/obsei-master/SECURITY.md b/obsei_module/obsei-master/SECURITY.md
new file mode 100644
index 0000000000000000000000000000000000000000..40ce33e3996ab24222f9c236fe167128c507ed6e
--- /dev/null
+++ b/obsei_module/obsei-master/SECURITY.md
@@ -0,0 +1,5 @@
+# Security Policy
+
+## Reporting a Vulnerability
+
+For any security issue please report it via [email](mailto:contact@oraika.com).
diff --git a/obsei_module/obsei-master/_config.yml b/obsei_module/obsei-master/_config.yml
new file mode 100644
index 0000000000000000000000000000000000000000..0b55420d431480b1c3f2d4515c45b47c2e0625df
--- /dev/null
+++ b/obsei_module/obsei-master/_config.yml
@@ -0,0 +1,9 @@
+theme: jekyll-theme-primer
+markdown: CommonMarkGhPages
+commonmark:
+ options: ["UNSAFE", "SMART", "FOOTNOTES"]
+ extensions: ["strikethrough", "autolink", "table", "tagfilter"]
+title: "Obsei: An open-source low-code AI powered automation tool"
+description: "Obsei is an open-source low-code AI powered automation tool"
+
+google_analytics: G-0E2FTKBK4T
diff --git a/obsei_module/obsei-master/_includes/head-custom-google-analytics.html b/obsei_module/obsei-master/_includes/head-custom-google-analytics.html
new file mode 100644
index 0000000000000000000000000000000000000000..360ca261d4caea0b2597b4d53b2e95605b341b86
--- /dev/null
+++ b/obsei_module/obsei-master/_includes/head-custom-google-analytics.html
@@ -0,0 +1,9 @@
+
+
+
diff --git a/obsei_module/obsei-master/binder/requirements.txt b/obsei_module/obsei-master/binder/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c436e37c0702f46f8deb36b9deed2d3fb8491296
--- /dev/null
+++ b/obsei_module/obsei-master/binder/requirements.txt
@@ -0,0 +1,2 @@
+git+https://github.com/obsei/obsei@master#egg=obsei[all]
+trafilatura
diff --git a/obsei_module/obsei-master/example/app_store_scrapper_example.py b/obsei_module/obsei-master/example/app_store_scrapper_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcbf9bb1a4a24527319dda4b130a41f7cb12f549
--- /dev/null
+++ b/obsei_module/obsei-master/example/app_store_scrapper_example.py
@@ -0,0 +1,41 @@
+import logging
+import sys
+from datetime import datetime, timedelta
+
+import pytz
+
+from obsei.analyzer.classification_analyzer import ClassificationAnalyzerConfig, ZeroShotClassificationAnalyzer
+from obsei.misc.utils import DATETIME_STRING_PATTERN
+from obsei.source.appstore_scrapper import (
+ AppStoreScrapperConfig,
+ AppStoreScrapperSource,
+)
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+since_time = datetime.utcnow().astimezone(pytz.utc) + timedelta(days=-5)
+source_config = AppStoreScrapperConfig(
+ app_url='https://apps.apple.com/us/app/gmail-email-by-google/id422689480',
+ lookup_period=since_time.strftime(DATETIME_STRING_PATTERN),
+ max_count=10,
+)
+
+source = AppStoreScrapperSource()
+
+text_analyzer = ZeroShotClassificationAnalyzer(
+ model_name_or_path="typeform/mobilebert-uncased-mnli", device="auto"
+)
+
+source_response_list = source.lookup(source_config)
+for idx, source_response in enumerate(source_response_list):
+ logger.info(f"source_response#'{idx}'='{source_response.__dict__}'")
+
+analyzer_response_list = text_analyzer.analyze_input(
+ source_response_list=source_response_list,
+ analyzer_config=ClassificationAnalyzerConfig(
+ labels=["interface", "slow", "battery"],
+ ),
+)
+for idx, an_response in enumerate(analyzer_response_list):
+ logger.info(f"analyzer_response#'{idx}'='{an_response.__dict__}'")
diff --git a/obsei_module/obsei-master/example/daily_get_example.py b/obsei_module/obsei-master/example/daily_get_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b8209b21f1113035aa7f45a3b419e77fbac73e2
--- /dev/null
+++ b/obsei_module/obsei-master/example/daily_get_example.py
@@ -0,0 +1,77 @@
+import logging
+import os
+import sys
+from pathlib import Path
+
+from obsei.sink.dailyget_sink import DailyGetSink, DailyGetSinkConfig
+from obsei.source.twitter_source import TwitterSource, TwitterSourceConfig
+from obsei.analyzer.classification_analyzer import (
+ ClassificationAnalyzerConfig,
+ ZeroShotClassificationAnalyzer,
+)
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+sink_config = DailyGetSinkConfig(
+ url=os.environ["DAILYGET_URL"],
+ partner_id=os.environ["DAILYGET_PARTNER_ID"],
+ consumer_phone_number=os.environ["DAILYGET_CONSUMER_NUMBER"],
+ source_information="Twitter " + os.environ["DAILYGET_QUERY"],
+ base_payload={
+ "partnerId": os.environ["DAILYGET_PARTNER_ID"],
+ "consumerPhoneNumber": os.environ["DAILYGET_CONSUMER_NUMBER"],
+ },
+)
+
+dir_path = Path(__file__).resolve().parent.parent
+source_config = TwitterSourceConfig(
+ keywords=[os.environ["DAILYGET_QUERY"]],
+ lookup_period=os.environ["DAILYGET_LOOKUP_PERIOD"],
+ tweet_fields=[
+ "author_id",
+ "conversation_id",
+ "created_at",
+ "id",
+ "public_metrics",
+ "text",
+ ],
+ user_fields=["id", "name", "public_metrics", "username", "verified"],
+ expansions=["author_id"],
+ place_fields=None,
+ max_tweets=10,
+)
+
+source = TwitterSource()
+sink = DailyGetSink()
+text_analyzer = ZeroShotClassificationAnalyzer(
+ model_name_or_path="joeddav/bart-large-mnli-yahoo-answers",
+ # model_name_or_path="joeddav/xlm-roberta-large-xnli",
+)
+
+source_response_list = source.lookup(source_config)
+for idx, source_response in enumerate(source_response_list):
+ logger.info(f"source_response#'{idx}'='{source_response.__dict__}'")
+
+analyzer_response_list = text_analyzer.analyze_input(
+ source_response_list=source_response_list,
+ analyzer_config=ClassificationAnalyzerConfig(
+ labels=[
+ "service",
+ "delay",
+ "tracking",
+ "no response",
+ "missing items",
+ "delivery",
+ "mask",
+ ],
+ ),
+)
+for idx, an_response in enumerate(analyzer_response_list):
+ logger.info(f"analyzer_response#'{idx}'='{an_response.__dict__}'")
+
+# HTTP Sink
+sink_response_list = sink.send_data(analyzer_response_list, sink_config)
+for sink_response in sink_response_list:
+ if sink_response is not None:
+ logger.info(f"sink_response='{sink_response.__dict__}'")
diff --git a/obsei_module/obsei-master/example/elasticsearch_example.py b/obsei_module/obsei-master/example/elasticsearch_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..81bc7594ab85cf66d6259d7b41c15ebf12c473fc
--- /dev/null
+++ b/obsei_module/obsei-master/example/elasticsearch_example.py
@@ -0,0 +1,69 @@
+import logging
+import sys
+from pathlib import Path
+
+from obsei.sink.elasticsearch_sink import ElasticSearchSink, ElasticSearchSinkConfig
+from obsei.source.twitter_source import TwitterSource, TwitterSourceConfig
+from obsei.analyzer.classification_analyzer import (
+    ClassificationAnalyzerConfig,
+    ZeroShotClassificationAnalyzer,
+)
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+# NOTE(review): dir_path is unused below — confirm before removing.
+dir_path = Path(__file__).resolve().parent.parent
+source_config = TwitterSourceConfig(
+    # NOTE(review): sibling examples pass a list here (e.g. keywords=["..."]);
+    # confirm a bare string is accepted by TwitterSourceConfig.
+    keywords="@Handle",
+    lookup_period="1h",  # 1 Hour
+    tweet_fields=[
+        "author_id",
+        "conversation_id",
+        "created_at",
+        "id",
+        "public_metrics",
+        "text",
+    ],
+    user_fields=["id", "name", "public_metrics", "username", "verified"],
+    expansions=["author_id"],
+    place_fields=None,
+    max_tweets=10,
+)
+
+source = TwitterSource()
+# Zero-shot classifier; labels are supplied per-request below.
+text_analyzer = ZeroShotClassificationAnalyzer(
+    model_name_or_path="joeddav/bart-large-mnli-yahoo-answers",
+)
+
+# Start Elasticsearch server locally
+# `docker run -d --name elasticsearch -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2`
+sink_config = ElasticSearchSinkConfig(
+    host="localhost",
+    port=9200,
+    index_name="test",
+)
+
+source_response_list = source.lookup(source_config)
+for idx, source_response in enumerate(source_response_list):
+    logger.info(f"source_response#'{idx}'='{source_response.__dict__}'")
+
+analyzer_response_list = text_analyzer.analyze_input(
+    source_response_list=source_response_list,
+    analyzer_config=ClassificationAnalyzerConfig(
+        labels=[
+            "service",
+            "delay",
+            "tracking",
+            "no response",
+            "missing items",
+            "delivery",
+            "mask",
+        ],
+    ),
+)
+for idx, an_response in enumerate(analyzer_response_list):
+    logger.info(f"analyzer_response#'{idx}'='{an_response.__dict__}'")
+
+# Index the classified payloads into the local Elasticsearch instance.
+sink = ElasticSearchSink()
+sink_response = sink.send_data(analyzer_response_list, sink_config)
+logger.info(f"sink_response='{sink_response}'")
diff --git a/obsei_module/obsei-master/example/email_source_example.py b/obsei_module/obsei-master/example/email_source_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..414819c8c56a5de328d7c7dbe694a5d9d5f4f2ef
--- /dev/null
+++ b/obsei_module/obsei-master/example/email_source_example.py
@@ -0,0 +1,36 @@
+import logging
+import os
+import sys
+from datetime import datetime, timedelta
+
+import pytz
+
+from obsei.misc.utils import DATETIME_STRING_PATTERN
+from obsei.source.email_source import EmailConfig, EmailCredInfo, EmailSource
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+since_time = datetime.utcnow().astimezone(pytz.utc) + timedelta(hours=-10)
+
+# List of IMAP servers for most commonly used email providers
+# https://www.systoolsgroup.com/imap/
+# Also, if you're using a Gmail account then make sure you allow less secure apps on your account -
+# https://myaccount.google.com/lesssecureapps?pli=1
+# Also enable IMAP access -
+# https://mail.google.com/mail/u/0/#settings/fwdandpop
+source_config = EmailConfig(
+ imap_server="imap.gmail.com",
+ cred_info=EmailCredInfo(
+ # It will fetch username and password from environment variable
+ username=os.environ.get("email_username"),
+ password=os.environ.get("email_password"),
+ ),
+ lookup_period=since_time.strftime(DATETIME_STRING_PATTERN),
+)
+
+source = EmailSource()
+source_response_list = source.lookup(source_config)
+
+for source_response in source_response_list:
+ logger.info(source_response.__dict__)
diff --git a/obsei_module/obsei-master/example/facebook_example.py b/obsei_module/obsei-master/example/facebook_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..207e1eb005288648bc2c67f15150496e3fd66ab9
--- /dev/null
+++ b/obsei_module/obsei-master/example/facebook_example.py
@@ -0,0 +1,19 @@
+import logging
+import sys
+
+from obsei.source.facebook_source import FacebookSource, FacebookSourceConfig
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+# Fetch the last 2 months of posts from the given Facebook page id.
+source_config = FacebookSourceConfig(page_id="110844591144719", lookup_period="2M")
+source = FacebookSource()
+source_response_list = source.lookup(source_config)
+
+logger.info("DETAILS:")
+for source_response in source_response_list:
+    logger.info(source_response)
+
+logger.info("TEXT:")
+for source_response in source_response_list:
+    logger.info(source_response.processed_text)
diff --git a/obsei_module/obsei-master/example/google_news_example.py b/obsei_module/obsei-master/example/google_news_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..40277f61d7f7b37f62154dd84ed7bb2003a35e9a
--- /dev/null
+++ b/obsei_module/obsei-master/example/google_news_example.py
@@ -0,0 +1,58 @@
+from obsei.analyzer.classification_analyzer import (
+    ClassificationAnalyzerConfig,
+    ZeroShotClassificationAnalyzer,
+)
+from obsei.source.google_news_source import GoogleNewsConfig, GoogleNewsSource
+
+# Only fetch title and highlight
+source_config_without_full_text = GoogleNewsConfig(
+    query="ai",
+    max_results=150,
+    after_date='2023-12-01',
+    before_date='2023-12-31',
+)
+
+# Fetch full news article
+source_config_with_full_text = GoogleNewsConfig(
+    query="ai",
+    max_results=5,
+    fetch_article=True,
+    lookup_period="1d",
+    # proxy="http://127.0.0.1:8080"
+)
+
+source = GoogleNewsSource()
+
+# Single label set reused for both runs below.
+analyzer_config = ClassificationAnalyzerConfig(
+    labels=["buy", "sell", "going up", "going down"],
+)
+
+text_analyzer = ZeroShotClassificationAnalyzer(
+    model_name_or_path="typeform/mobilebert-uncased-mnli", device="auto"
+)
+
+news_articles_without_full_text = source.lookup(source_config_without_full_text)
+
+news_articles_with_full_text = source.lookup(source_config_with_full_text)
+
+
+analyzer_responses_without_full_text = text_analyzer.analyze_input(
+    source_response_list=news_articles_without_full_text,
+    analyzer_config=analyzer_config,
+)
+
+analyzer_responses_with_full_text = text_analyzer.analyze_input(
+    source_response_list=news_articles_with_full_text, analyzer_config=analyzer_config
+)
+
+# Dump raw articles and classification results for both variants.
+for article in news_articles_without_full_text:
+    print(article.__dict__)
+
+for response in analyzer_responses_without_full_text:
+    print(response.__dict__)
+
+for article in news_articles_with_full_text:
+    print(article.__dict__)
+
+for response in analyzer_responses_with_full_text:
+    print(response.__dict__)
diff --git a/obsei_module/obsei-master/example/jira_example.py b/obsei_module/obsei-master/example/jira_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..00b59600731b689da9ab57bffd71285e729754e3
--- /dev/null
+++ b/obsei_module/obsei-master/example/jira_example.py
@@ -0,0 +1,77 @@
+# Jira Sink
+import logging
+import os
+import sys
+from pathlib import Path
+
+from pydantic import SecretStr
+
+from obsei.sink.jira_sink import JiraSink, JiraSinkConfig
+from obsei.source.twitter_source import (
+    TwitterCredentials,
+    TwitterSource,
+    TwitterSourceConfig,
+)
+from obsei.analyzer.classification_analyzer import (
+    ClassificationAnalyzerConfig,
+    ZeroShotClassificationAnalyzer,
+)
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+# NOTE(review): dir_path is unused below — confirm before removing.
+dir_path = Path(__file__).resolve().parent.parent
+# Search tweets containing "facing issue" from the last hour; credentials
+# are read from environment variables and wrapped in SecretStr.
+source_config = TwitterSourceConfig(
+    keywords=["facing issue"],
+    lookup_period="1h",
+    tweet_fields=[
+        "author_id",
+        "conversation_id",
+        "created_at",
+        "id",
+        "public_metrics",
+        "text",
+    ],
+    user_fields=["id", "name", "public_metrics", "username", "verified"],
+    expansions=["author_id"],
+    place_fields=None,
+    max_tweets=10,
+    cred_info=TwitterCredentials(
+        consumer_key=SecretStr(os.environ["twitter_consumer_key"]),
+        consumer_secret=SecretStr(os.environ["twitter_consumer_secret"]),
+    ),
+)
+
+source = TwitterSource()
+
+# To start jira server locally `atlas-run-standalone --product jira`
+jira_sink_config = JiraSinkConfig(
+    url="http://localhost:2990/jira",
+    username=SecretStr("admin"),
+    password=SecretStr("admin"),
+    issue_type={"name": "Task"},
+    project={"key": "CUS"},
+)
+jira_sink = JiraSink()
+
+text_analyzer = ZeroShotClassificationAnalyzer(
+    model_name_or_path="joeddav/bart-large-mnli-yahoo-answers"
+)
+
+source_response_list = source.lookup(source_config)
+for idx, source_response in enumerate(source_response_list):
+    logger.info(f"source_response#'{idx}'='{source_response.__dict__}'")
+
+analyzer_response_list = text_analyzer.analyze_input(
+    source_response_list=source_response_list,
+    analyzer_config=ClassificationAnalyzerConfig(
+        labels=["service", "delay", "performance"],
+    ),
+)
+for idx, an_response in enumerate(analyzer_response_list):
+    logger.info(f"analyzer_response#'{idx}'='{an_response.__dict__}'")
+
+# Create one Jira issue per classified payload.
+sink_response_list = jira_sink.send_data(analyzer_response_list, jira_sink_config)
+for sink_response in sink_response_list:
+    if sink_response is not None:
+        logger.info(f"sink_response='{sink_response}'")
diff --git a/obsei_module/obsei-master/example/maps_review_scrapper_example.py b/obsei_module/obsei-master/example/maps_review_scrapper_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0633a025bb9fc15c933bd5a5a4058a0012e6392
--- /dev/null
+++ b/obsei_module/obsei-master/example/maps_review_scrapper_example.py
@@ -0,0 +1,22 @@
+import logging
+import sys
+
+from obsei.source.google_maps_reviews import (OSGoogleMapsReviewsConfig,
+                                              OSGoogleMapsReviewsSource)
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+# Scrape Google Maps reviews via the Outscraper API for the given place URL.
+source_config = OSGoogleMapsReviewsConfig(
+    api_key="",  # Get API key from https://outscraper.com/
+    queries=[
+        "https://www.google.co.in/maps/place/Taj+Mahal/@27.1751496,78.0399535,17z/data=!4m5!3m4!1s0x39747121d702ff6d:0xdd2ae4803f767dde!8m2!3d27.1751448!4d78.0421422"
+    ],
+    number_of_reviews=3,
+)
+
+source = OSGoogleMapsReviewsSource()
+
+source_response_list = source.lookup(source_config)
+for source_response in source_response_list:
+    logger.info(source_response.__dict__)
diff --git a/obsei_module/obsei-master/example/pandas_sink_example.py b/obsei_module/obsei-master/example/pandas_sink_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2d817ad0c369e4b81eda9f754f149c50c2875c8
--- /dev/null
+++ b/obsei_module/obsei-master/example/pandas_sink_example.py
@@ -0,0 +1,49 @@
+import logging
+import sys
+
+from pandas import DataFrame
+
+from obsei.analyzer.classification_analyzer import (
+    ClassificationAnalyzerConfig,
+    ZeroShotClassificationAnalyzer,
+)
+from obsei.sink.pandas_sink import PandasSink, PandasSinkConfig
+from obsei.source.playstore_scrapper import (
+    PlayStoreScrapperConfig,
+    PlayStoreScrapperSource,
+)
+
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+# Scrape up to 3 US Play Store reviews for the given package.
+source_config = PlayStoreScrapperConfig(
+    countries=["us"], package_name="com.apcoaconnect", max_count=3
+)
+
+source = PlayStoreScrapperSource()
+
+text_analyzer = ZeroShotClassificationAnalyzer(
+    model_name_or_path="typeform/mobilebert-uncased-mnli", device="auto"
+)
+
+# initialize pandas sink config
+sink_config = PandasSinkConfig(dataframe=DataFrame())
+
+# initialize pandas sink
+sink = PandasSink()
+
+source_response_list = source.lookup(source_config)
+
+analyzer_response_list = text_analyzer.analyze_input(
+    source_response_list=source_response_list,
+    analyzer_config=ClassificationAnalyzerConfig(
+        labels=["no parking", "registration issue", "app issue", "payment issue"],
+    ),
+)
+
+# send_data returns the populated DataFrame rather than per-item responses.
+dataframe = sink.send_data(
+    analyzer_responses=analyzer_response_list, config=sink_config
+)
+
+print(dataframe.to_csv())
diff --git a/obsei_module/obsei-master/example/pandas_source_example.py b/obsei_module/obsei-master/example/pandas_source_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a82af3ac3cb46fd4b08de92b00b04754249511c
--- /dev/null
+++ b/obsei_module/obsei-master/example/pandas_source_example.py
@@ -0,0 +1,27 @@
+import pandas as pd
+
+from obsei.source.pandas_source import (
+    PandasSourceConfig,
+    PandasSource,
+)
+import logging
+import sys
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+# Initialize your Pandas DataFrame from your sources like csv, excel, sql etc
+# In following example we are reading csv which have two columns title and text
+csv_file = "https://raw.githubusercontent.com/deepset-ai/haystack/master/tutorials/small_generator_dataset.csv"
+dataframe = pd.read_csv(csv_file)
+
+# "text" columns become the payload text; "title" is carried along as metadata.
+source_config = PandasSourceConfig(
+    dataframe=dataframe,
+    include_columns=["title"],
+    text_columns=["text"],
+)
+source = PandasSource()
+
+source_response_list = source.lookup(source_config)
+for idx, source_response in enumerate(source_response_list):
+    logger.info(f"source_response#'{idx}'='{source_response.__dict__}'")
diff --git a/obsei_module/obsei-master/example/pii_analyzer_example.py b/obsei_module/obsei-master/example/pii_analyzer_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..36ec4ff72c3f5221ccbc7c35d74897619ce69514
--- /dev/null
+++ b/obsei_module/obsei-master/example/pii_analyzer_example.py
@@ -0,0 +1,33 @@
+import logging
+import sys
+
+from obsei.payload import TextPayload
+from obsei.analyzer.pii_analyzer import (
+ PresidioEngineConfig,
+ PresidioModelConfig,
+ PresidioPIIAnalyzer,
+ PresidioPIIAnalyzerConfig,
+)
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+analyzer_config = PresidioPIIAnalyzerConfig(
+ analyze_only=False, return_decision_process=True
+)
+analyzer = PresidioPIIAnalyzer(
+ engine_config=PresidioEngineConfig(
+ nlp_engine_name="spacy",
+ models=[PresidioModelConfig(model_name="en_core_web_lg", lang_code="en")],
+ )
+)
+
+text_to_anonymize = "His name is Mr. Jones and his phone number is 212-555-5555"
+
+analyzer_results = analyzer.analyze_input(
+ source_response_list=[TextPayload(processed_text=text_to_anonymize)],
+ analyzer_config=analyzer_config,
+)
+
+for analyzer_result in analyzer_results:
+ logging.info(analyzer_result.to_dict())
diff --git a/obsei_module/obsei-master/example/play_store_reviews_example.py b/obsei_module/obsei-master/example/play_store_reviews_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..d37669a7a4441ce69be05152c7dad7aad5edd538
--- /dev/null
+++ b/obsei_module/obsei-master/example/play_store_reviews_example.py
@@ -0,0 +1,4 @@
+# TBD
+
+# Need proper service account file to test the changes :(
+print("TBD")
diff --git a/obsei_module/obsei-master/example/playstore_scrapper_example.py b/obsei_module/obsei-master/example/playstore_scrapper_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..00b1a5406f9c9785bd08262559edca0309832617
--- /dev/null
+++ b/obsei_module/obsei-master/example/playstore_scrapper_example.py
@@ -0,0 +1,40 @@
+import logging
+import sys
+
+from obsei.analyzer.classification_analyzer import (
+    ClassificationAnalyzerConfig,
+    ZeroShotClassificationAnalyzer,
+)
+
+from obsei.source.playstore_scrapper import (
+    PlayStoreScrapperConfig,
+    PlayStoreScrapperSource,
+)
+
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+# Scrape up to 3 reviews directly from the given Play Store listing URL.
+source_config = PlayStoreScrapperConfig(
+    app_url='https://play.google.com/store/apps/details?id=com.google.android.gm&hl=en_IN&gl=US',
+    max_count=3
+)
+
+source = PlayStoreScrapperSource()
+
+text_analyzer = ZeroShotClassificationAnalyzer(
+    model_name_or_path="typeform/mobilebert-uncased-mnli", device="auto"
+)
+
+source_response_list = source.lookup(source_config)
+for idx, source_response in enumerate(source_response_list):
+    logger.info(f"source_response#'{idx}'='{source_response.__dict__}'")
+
+analyzer_response_list = text_analyzer.analyze_input(
+    source_response_list=source_response_list,
+    analyzer_config=ClassificationAnalyzerConfig(
+        labels=["interface", "slow", "battery"],
+    ),
+)
+for idx, an_response in enumerate(analyzer_response_list):
+    logger.info(f"analyzer_response#'{idx}'='{an_response.__dict__}'")
diff --git a/obsei_module/obsei-master/example/playstore_scrapper_translator_example.py b/obsei_module/obsei-master/example/playstore_scrapper_translator_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..e89e09be4a17334368fcfe44039fa294728d020e
--- /dev/null
+++ b/obsei_module/obsei-master/example/playstore_scrapper_translator_example.py
@@ -0,0 +1,86 @@
+import json
+import logging
+import sys
+from datetime import datetime, timedelta
+
+import pytz
+
+from obsei.payload import TextPayload
+from obsei.analyzer.classification_analyzer import (
+    ClassificationAnalyzerConfig,
+    ZeroShotClassificationAnalyzer,
+)
+from obsei.analyzer.translation_analyzer import TranslationAnalyzer
+from obsei.misc.utils import DATETIME_STRING_PATTERN
+from obsei.source.playstore_scrapper import (
+    PlayStoreScrapperConfig,
+    PlayStoreScrapperSource,
+)
+
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+source = PlayStoreScrapperSource()
+
+
+def source_fetch():
+    # Scrape up to 5 reviews of a Hindi keyboard app from the last day.
+    # NOTE(review): utcnow().astimezone(pytz.utc) treats the naive timestamp
+    # as local time — wrong instant on non-UTC machines; confirm and fix.
+    since_time = datetime.utcnow().astimezone(pytz.utc) + timedelta(days=-1)
+    source_config = PlayStoreScrapperConfig(
+        countries=["us"],
+        package_name="com.color.apps.hindikeyboard.hindi.language",
+        lookup_period=since_time.strftime(
+            DATETIME_STRING_PATTERN
+        ),  # todo should be optional
+        max_count=5,
+    )
+    return source.lookup(source_config)
+
+
+def translate_text(text_list):
+    # Translate Hindi review text to English, returning fresh TextPayloads
+    # whose processed_text holds the translation.
+    translate_analyzer = TranslationAnalyzer(
+        model_name_or_path="Helsinki-NLP/opus-mt-hi-en", device="auto"
+    )
+    source_responses = [
+        TextPayload(processed_text=text.processed_text, source_name="sample")
+        for text in text_list
+    ]
+    analyzer_responses = translate_analyzer.analyze_input(
+        source_response_list=source_responses
+    )
+    return [
+        TextPayload(
+            processed_text=response.segmented_data["translated_text"],
+            source_name="translator",
+        )
+        for response in analyzer_responses
+    ]
+
+
+def classify_text(text_list):
+    # Zero-shot classify the (translated) payloads against fixed labels.
+    text_analyzer = ZeroShotClassificationAnalyzer(
+        model_name_or_path="joeddav/bart-large-mnli-yahoo-answers", device="cpu"
+    )
+
+    return text_analyzer.analyze_input(
+        source_response_list=text_list,
+        analyzer_config=ClassificationAnalyzerConfig(
+            labels=["no parking", "registration issue", "app issue", "payment issue"],
+        ),
+    )
+
+
+def print_list(text_name, text_list):
+    # Pretty-print each payload as indented JSON under a numbered tag.
+    for idx, text in enumerate(text_list):
+        json_response = json.dumps(text.__dict__, indent=4, sort_keys=True, default=str)
+        logger.info(f"\n{text_name}#'{idx}'='{json_response}'")
+
+
+logger.info("Started...")
+
+# Pipeline: scrape -> translate (hi->en) -> classify.
+source_responses_list = source_fetch()
+translated_text_list = translate_text(source_responses_list)
+analyzer_response_list = classify_text(translated_text_list)
+
+print_list("source_response", source_responses_list)
+print_list("translator_response", translated_text_list)
+print_list("classifier_response", analyzer_response_list)
diff --git a/obsei_module/obsei-master/example/reddit_example.py b/obsei_module/obsei-master/example/reddit_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdf3a8d60c0058e8cdde32914d1b984d7cbc848f
--- /dev/null
+++ b/obsei_module/obsei-master/example/reddit_example.py
@@ -0,0 +1,50 @@
+import logging
+import sys
+import time
+from datetime import datetime, timedelta
+
+import pytz
+
+from obsei.misc.utils import DATETIME_STRING_PATTERN
+from obsei.source.reddit_source import RedditConfig, RedditSource
+from obsei.workflow.store import WorkflowStore
+from obsei.workflow.workflow import Workflow, WorkflowConfig
+
+
+def print_state(id: str):
+ logger.info(f"Source State: {source.store.get_source_state(id)}")
+
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+since_time = datetime.utcnow().astimezone(pytz.utc) + timedelta(hours=-2)
+# Credentials will be fetched from env variable named reddit_client_id and reddit_client_secret
+source_config = RedditConfig(
+ subreddits=["wallstreetbets"],
+ lookup_period=since_time.strftime(DATETIME_STRING_PATTERN),
+)
+
+source = RedditSource(store=WorkflowStore())
+
+workflow = Workflow(
+ config=WorkflowConfig(
+ source_config=source_config,
+ ),
+)
+source.store.add_workflow(workflow)
+
+
+for i in range(1, 4):
+ print_state(workflow.id)
+ source_response_list = source.lookup(source_config, id=workflow.id)
+
+ if source_response_list is None or len(source_response_list) == 0:
+ break
+
+ for source_response in source_response_list:
+ logger.info(source_response.__dict__)
+
+ time.sleep(10)
+
+print_state(workflow.id)
diff --git a/obsei_module/obsei-master/example/reddit_scrapper_example.py b/obsei_module/obsei-master/example/reddit_scrapper_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..f306024440302f61ebf8f95f29adef98df0f8aaf
--- /dev/null
+++ b/obsei_module/obsei-master/example/reddit_scrapper_example.py
@@ -0,0 +1,30 @@
+import logging
+import sys
+from datetime import datetime, timedelta
+
+import pytz
+
+from obsei.misc.utils import DATETIME_STRING_PATTERN
+from obsei.source.reddit_scrapper import RedditScrapperConfig, RedditScrapperSource
+
+
+def print_state(id: str):
+ logger.info(f"Source State: {source.store.get_source_state(id)}")
+
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+since_time = datetime.utcnow().astimezone(pytz.utc) + timedelta(days=-1)
+
+source_config = RedditScrapperConfig(
+ url="https://www.reddit.com/r/wallstreetbets/comments/.rss?sort=new",
+ user_agent="testscript by u/FitStatistician7378",
+ lookup_period=since_time.strftime(DATETIME_STRING_PATTERN),
+)
+
+source = RedditScrapperSource()
+
+source_response_list = source.lookup(source_config)
+for source_response in source_response_list:
+ logger.info(source_response.__dict__)
diff --git a/obsei_module/obsei-master/example/sdk.yaml b/obsei_module/obsei-master/example/sdk.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..89f5d55d3e42af1fc5fe1be71f2d92930342dde4
--- /dev/null
+++ b/obsei_module/obsei-master/example/sdk.yaml
@@ -0,0 +1,97 @@
+# Sample Obsei SDK configuration, loaded by example/with_sdk_config_file.py via
+# ObseiConfiguration.initialize_instance(<top-level key>). Each `_target_`
+# names the fully-qualified class to instantiate with the sibling keys as
+# constructor arguments.
+twitter_source:
+  _target_: obsei.source.twitter_source.TwitterSourceConfig
+  keywords:
+    - "@sample"
+  lookup_period: "1d"
+  tweet_fields:
+    - "author_id"
+    - "conversation_id"
+    - "created_at"
+    - "id"
+    - "public_metrics"
+    - "text"
+  user_fields:
+    - "id"
+    - "name"
+    - "public_metrics"
+    - "username"
+    - "verified"
+  expansions:
+    - "author_id"
+  place_fields: []
+  max_tweets: 10
+  credential:
+    _target_: obsei.source.twitter_source.TwitterCredentials
+    bearer_token: "bearer_token"
+
+play_store_source:
+  _target_: obsei.source.playstore_reviews.PlayStoreConfig
+  package_name: "com.company.package"
+  max_results: 10
+  num_retries: 2
+  cred_info:
+    _target_: obsei.source.playstore_reviews.GoogleCredInfo
+    service_cred_file: "foo/credential.json"
+    developer_key: "test_key"
+
+daily_get_sink:
+  _target_: obsei.sink.dailyget_sink.DailyGetSinkConfig
+  url: "http://localhost:8080/sample"
+  partner_id: "123456"
+  consumer_phone_number: "1234567890"
+  source_information: "Twitter @sample"
+  # NOTE(review): these values are literal strings, not references to the
+  # keys above — confirm whether interpolation was intended.
+  base_payload:
+    partnerId: daily_get_sink.partner_id
+    consumerPhoneNumber: daily_get_sink.consumer_phone_number
+
+http_sink:
+  _target_: obsei.sink.http_sink.HttpSinkConfig
+  url: "http://localhost:8080/sample"
+
+elasticsearch_sink:
+  _target_: obsei.sink.elasticsearch_sink.ElasticSearchSinkConfig
+  host: "localhost"
+  port: 9200
+  index_name: "test"
+
+jira_sink:
+  _target_: obsei.sink.jira_sink.JiraSinkConfig
+  url: "http://localhost:2990/jira"
+  username: "user"
+  password: "pass"
+  issue_type:
+    name: "Task"
+  project:
+    key: "CUS"
+
+analyzer_config:
+  _target_: obsei.analyzer.classification_analyzer.ClassificationAnalyzerConfig
+  labels:
+    - "service"
+    - "delay"
+    - "tracking"
+    - "no response"
+  add_positive_negative_labels: false
+
+analyzer:
+  _target_: obsei.analyzer.classification_analyzer.ZeroShotClassificationAnalyzer
+  model_name_or_path: "typeform/mobilebert-uncased-mnli"
+  device: "auto"
+
+slack_sink:
+  _target_: obsei.sink.SlackSink
+
+slack_sink_config:
+  _target_: obsei.sink.SlackSinkConfig
+  slack_token: 'Enter token'
+  channel_id: 'slack channel id'
+  # Jinja template that renders the payload dict as a flattened key: value
+  # list inside a Slack code fence.
+  jinja_template: |
+    ```
+    {%- for key, value in payload.items() recursive%}
+    {%- if value is mapping -%}
+    {{loop(value.items())}}
+    {%- else %}
+    {{key}}: {{value}}
+    {%- endif %}
+    {%- endfor%}
+    ```
diff --git a/obsei_module/obsei-master/example/slack_example.py b/obsei_module/obsei-master/example/slack_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d7d8a6d1730a7ef37217940eaafd5d96b40f2b0
--- /dev/null
+++ b/obsei_module/obsei-master/example/slack_example.py
@@ -0,0 +1,66 @@
+import logging
+import os
+import sys
+
+from obsei.analyzer.dummy_analyzer import DummyAnalyzer, DummyAnalyzerConfig
+from obsei.processor import Processor
+from obsei.sink.slack_sink import SlackSink, SlackSinkConfig
+from obsei.source.playstore_scrapper import (PlayStoreScrapperConfig,
+                                             PlayStoreScrapperSource)
+from obsei.workflow.store import WorkflowStore
+from obsei.workflow.workflow import Workflow, WorkflowConfig
+
+
+def print_state(identifier: str):
+    # Log the persisted source state for the given workflow id.
+    logger.info(f"Source State: {source.store.get_source_state(identifier)}")
+
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+
+# Shared store so source and sink track workflow state between runs.
+workflow_store = WorkflowStore()
+
+source_config = PlayStoreScrapperConfig(
+    app_url='https://play.google.com/store/apps/details?id=com.google.android.gm&hl=en_IN&gl=US',
+    max_count=3
+)
+
+source = PlayStoreScrapperSource(store=workflow_store)
+
+# Jinja template renders the payload dict as flattened key: value lines
+# inside a Slack code fence.
+sink_config = SlackSinkConfig(
+    slack_token=os.environ["SLACK_TOKEN"],
+    channel_id="C01TUPZ23NZ",
+    jinja_template="""
+```
+  {%- for key, value in payload.items() recursive%}
+   {%- if value is mapping -%}
+{{loop(value.items())}}
+   {%- else %}
+{{key}}: {{value}}
+   {%- endif %}
+  {%- endfor%}
+```
+    """
+)
+sink = SlackSink(store=workflow_store)
+
+# DummyAnalyzer passes payloads through unchanged — this example demos the
+# source -> sink plumbing, not analysis.
+analyzer_config = DummyAnalyzerConfig()
+analyzer = DummyAnalyzer()
+
+workflow = Workflow(
+    config=WorkflowConfig(
+        source_config=source_config,
+        sink_config=sink_config,
+        analyzer_config=analyzer_config,
+    ),
+)
+workflow_store.add_workflow(workflow)
+
+# Processor runs the full fetch -> analyze -> send cycle for the workflow.
+processor = Processor(
+    analyzer=analyzer, sink=sink, source=source, analyzer_config=analyzer_config
+)
+
+processor.process(workflow=workflow)
+
+print_state(workflow.id)
diff --git a/obsei_module/obsei-master/example/twitter_source_example.py b/obsei_module/obsei-master/example/twitter_source_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc92b681dd4b2ea248162954c0ee2d2306b110f0
--- /dev/null
+++ b/obsei_module/obsei-master/example/twitter_source_example.py
@@ -0,0 +1,98 @@
+import logging
+import sys
+
+from obsei.analyzer.classification_analyzer import ZeroShotClassificationAnalyzer, ClassificationAnalyzerConfig
+from obsei.sink.slack_sink import SlackSinkConfig, SlackSink
+from obsei.source.twitter_source import TwitterSourceConfig, TwitterSource, TwitterCredentials
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+twitter_cred_info = None
+
+# Enter your twitter credentials
+# Get it from https://developer.twitter.com/en/apply-for-access
+# Currently it will fetch from environment variables: twitter_bearer_token, twitter_consumer_key, twitter_consumer_secret
+# Uncomment below lines if you like to pass credentials directly instead of env variables
+
+# twitter_cred_info = TwitterCredentials(
+# bearer_token='',
+# consumer_key="",
+# consumer_secret=""
+# )
+
+source_config = TwitterSourceConfig(
+ query="bitcoin",
+ lookup_period="1h",
+ tweet_fields=[
+ "author_id",
+ "conversation_id",
+ "created_at",
+ "id",
+ "public_metrics",
+ "text",
+ ],
+ user_fields=["id", "name", "public_metrics", "username", "verified"],
+ expansions=["author_id"],
+ place_fields=None,
+ max_tweets=10,
+ cred_info=twitter_cred_info or None
+)
+
+source = TwitterSource()
+
+
+sink_config = SlackSinkConfig(
+ # Uncomment below lines if you like to pass credentials directly instead of env variables
+ # slack_token="SLACK_TOKEN",
+ # channel_id="CHANNEL_ID",
+ jinja_template="""
+:bell: Hi there!, a new `<{{payload['meta']['tweet_url']}}|tweet>` of interest is found by *Obsei*
+>📝 Content:
+```{{payload['meta']['text']}}```
+>ℹ️Information:
+```
+User Name: {{payload['meta']['author_info']['name']}} ({{payload['meta']['author_info']['user_url']}})
+Tweet Metrics: Retweets={{payload['meta']['public_metrics']['retweet_count']}}, Likes={{payload['meta']['public_metrics']['like_count']}}
+Author Metrics: Verified={{payload['meta']['author_info']['verified']}}, Followers={{payload['meta']['author_info']['public_metrics']['followers_count']}}
+```
+>🧠 AI Engine Data:
+```
+ {%- for key, value in payload['segmented_data']['classifier_data'].items() recursive%}
+ {%- if value is mapping -%}
+{{loop(value.items())}}
+ {%- else %}
+{{key}}: {{value}}
+ {%- endif %}
+ {%- endfor%}
+```
+ """
+)
+sink = SlackSink()
+
+text_analyzer = ZeroShotClassificationAnalyzer(
+ model_name_or_path="typeform/mobilebert-uncased-mnli", device="auto"
+)
+
+analyzer_config = ClassificationAnalyzerConfig(
+ labels=["going up", "going down"],
+ add_positive_negative_labels=False,
+)
+
+source_response_list = source.lookup(source_config)
+for idx, source_response in enumerate(source_response_list):
+ logger.info(f"source_response#'{idx}'='{source_response.__dict__}'")
+
+analyzer_response_list = text_analyzer.analyze_input(
+ source_response_list=source_response_list,
+ analyzer_config=analyzer_config,
+)
+
+for idx, an_response in enumerate(analyzer_response_list):
+ logger.info(f"analyzer_response#'{idx}'='{an_response.__dict__}'")
+
+sink_response_list = sink.send_data(
+ analyzer_responses=analyzer_response_list, config=sink_config, id=id
+)
+for idx, sink_response in enumerate(sink_response_list):
+ logger.info(f"source_response#'{idx}'='{sink_response.__dict__}'")
diff --git a/obsei_module/obsei-master/example/web_crawler_example.py b/obsei_module/obsei-master/example/web_crawler_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..51e5d68ba7ea902781110b1cdf327cba25630383
--- /dev/null
+++ b/obsei_module/obsei-master/example/web_crawler_example.py
@@ -0,0 +1,43 @@
+# Fetch full news article
+from obsei.source.website_crawler_source import (
+    TrafilaturaCrawlerConfig,
+    TrafilaturaCrawlerSource,
+)
+
+
+def print_list(response_list):
+    # Dump each crawled payload's attributes to stdout.
+    for response in response_list:
+        print(response.__dict__)
+
+
+# Single URL
+source_config = TrafilaturaCrawlerConfig(urls=["https://obsei.github.io/obsei/"])
+
+source = TrafilaturaCrawlerSource()
+
+source_response_list = source.lookup(source_config)
+print_list(source_response_list)
+
+
+# RSS feed (Note it will take lot of time)
+source_config = TrafilaturaCrawlerConfig(
+    urls=["https://news.google.com/rss/search?q=bitcoin&hl=en&gl=US&ceid=US:en"],
+    is_feed=True,
+)
+
+source = TrafilaturaCrawlerSource()
+
+source_response_list = source.lookup(source_config)
+print_list(source_response_list)
+
+
+# Full website (Note it will take lot of time)
+source_config = TrafilaturaCrawlerConfig(
+    urls=["https://haystack.deepset.ai/"],
+    is_sitemap=True,
+)
+
+source = TrafilaturaCrawlerSource()
+
+source_response_list = source.lookup(source_config)
+print_list(source_response_list)
diff --git a/obsei_module/obsei-master/example/with_sdk_config_file.py b/obsei_module/obsei-master/example/with_sdk_config_file.py
new file mode 100644
index 0000000000000000000000000000000000000000..37da8373fa032c2ceecab54de181ed9e9190ae04
--- /dev/null
+++ b/obsei_module/obsei-master/example/with_sdk_config_file.py
@@ -0,0 +1,28 @@
+import logging
+import sys
+
+from obsei.configuration import ObseiConfiguration
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+obsei_configuration = ObseiConfiguration(
+ config_path="../example",
+ config_filename="sdk.yaml",
+)
+
+text_analyzer = obsei_configuration.initialize_instance("analyzer")
+analyzer_config = obsei_configuration.initialize_instance("analyzer_config")
+slack_source_config = obsei_configuration.initialize_instance("slack_sink_config")
+slack_sink = obsei_configuration.initialize_instance("slack_sink")
+
+play_store_source_config = obsei_configuration.initialize_instance("play_store_source")
+twitter_source_config = obsei_configuration.initialize_instance("twitter_source")
+http_sink_config = obsei_configuration.initialize_instance("http_sink")
+daily_get_sink_config = obsei_configuration.initialize_instance("daily_get_sink")
+# docker run -d --name elasticsearch -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2
+elasticsearch_sink_config = obsei_configuration.initialize_instance(
+ "elasticsearch_sink"
+)
+# Start jira server locally `atlas-run-standalone --product jira`
+jira_sink_config = obsei_configuration.initialize_instance("jira_sink")
diff --git a/obsei_module/obsei-master/example/with_state_example.py b/obsei_module/obsei-master/example/with_state_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..40ac7a0bc68a781907fd720861f035f557c2b67f
--- /dev/null
+++ b/obsei_module/obsei-master/example/with_state_example.py
@@ -0,0 +1,60 @@
+import logging
+import sys
+import time
+
+from obsei.workflow.store import WorkflowStore
+from obsei.source.twitter_source import TwitterSource, TwitterSourceConfig
+from obsei.workflow.workflow import Workflow, WorkflowConfig
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+# Create workflow store instance, by default it will use SQLite to store state data
+store = WorkflowStore()
+
+# Pass store reference to observer, so it can use it to store state data
+source = TwitterSource(store=store)
+
+
+def print_state(id: str):
+ logger.info(f"Source State: {source.store.get_source_state(id)}")
+
+
+source_config = TwitterSourceConfig(
+ keywords=["india"],
+ lookup_period="2m",
+ tweet_fields=[
+ "author_id",
+ "conversation_id",
+ "created_at",
+ "id",
+ "public_metrics",
+ "text",
+ ],
+ user_fields=["id", "name", "public_metrics", "username", "verified"],
+ expansions=["author_id"],
+ place_fields=None,
+ max_tweets=10,
+)
+
+# Create instance of workflow, adding observer config to it, it will autgenerate unique workflow id
+workflow = Workflow(
+ config=WorkflowConfig(
+ source_config=source_config,
+ ),
+)
+# Insert workflow config to DB store
+store.add_workflow(workflow)
+
+for i in range(1, 4):
+ print_state(workflow.id)
+ # Now always pass workflow id to lookup function
+ # Observer will fetch old data from DB suing this id and later store new updated state data against this id to DB
+ source_response_list = source.lookup(source_config, id=workflow.id)
+
+ if source_response_list is None or len(source_response_list) == 0:
+ break
+
+ time.sleep(180)
+
+print_state(workflow.id)
diff --git a/obsei_module/obsei-master/example/youtube_scrapper_example.py b/obsei_module/obsei-master/example/youtube_scrapper_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bded3d4884a7f7523eea96e9275e648ebd9ad2f
--- /dev/null
+++ b/obsei_module/obsei-master/example/youtube_scrapper_example.py
@@ -0,0 +1,36 @@
+import logging
+import sys
+
+from obsei.analyzer.classification_analyzer import (
+    ClassificationAnalyzerConfig, ZeroShotClassificationAnalyzer)
+from obsei.source.youtube_scrapper import (YoutubeScrapperConfig,
+                                           YoutubeScrapperSource)
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+# Scrape up to 10 comments (including replies) from one video, looking back 1 year.
+source_config = YoutubeScrapperConfig(
+    video_url="https://www.youtube.com/watch?v=uZfns0JIlFk",
+    fetch_replies=True,
+    max_comments=10,
+    lookup_period="1Y",
+)
+
+source = YoutubeScrapperSource()
+
+# Fetch the comments and log each raw payload.
+source_response_list = source.lookup(source_config)
+for idx, source_response in enumerate(source_response_list):
+    logger.info(f"source_response#'{idx}'='{source_response.__dict__}'")
+
+# Zero-shot classify each comment; "auto" picks GPU when available, else CPU.
+text_analyzer = ZeroShotClassificationAnalyzer(
+    model_name_or_path="typeform/mobilebert-uncased-mnli", device="auto"
+)
+
+# Score every comment against the candidate labels and log the results.
+analyzer_response_list = text_analyzer.analyze_input(
+    source_response_list=source_response_list,
+    analyzer_config=ClassificationAnalyzerConfig(
+        labels=["interesting", "enquiring"],
+    ),
+)
+for idx, an_response in enumerate(analyzer_response_list):
+    logger.info(f"analyzer_response#'{idx}'='{an_response.__dict__}'")
diff --git a/obsei_module/obsei-master/images/Obsei-flow-diagram.png b/obsei_module/obsei-master/images/Obsei-flow-diagram.png
new file mode 100644
index 0000000000000000000000000000000000000000..e25e5fb52066f679c4ee7433cdcb59c19c4b4bd3
Binary files /dev/null and b/obsei_module/obsei-master/images/Obsei-flow-diagram.png differ
diff --git a/obsei_module/obsei-master/images/Obsei-future-concept.png b/obsei_module/obsei-master/images/Obsei-future-concept.png
new file mode 100644
index 0000000000000000000000000000000000000000..84d97eb97c5cc36b2d471789bad8edc23f2ad48d
Binary files /dev/null and b/obsei_module/obsei-master/images/Obsei-future-concept.png differ
diff --git a/obsei_module/obsei-master/images/jira_screenshot.png b/obsei_module/obsei-master/images/jira_screenshot.png
new file mode 100644
index 0000000000000000000000000000000000000000..d14e391487bdf46c5b279dd0f60c6de16302e901
Binary files /dev/null and b/obsei_module/obsei-master/images/jira_screenshot.png differ
diff --git a/obsei_module/obsei-master/images/logos/Slack_join.svg b/obsei_module/obsei-master/images/logos/Slack_join.svg
new file mode 100644
index 0000000000000000000000000000000000000000..1224dfdaba2b72dbfbc3f86a9e9a29aab4dc96b2
--- /dev/null
+++ b/obsei_module/obsei-master/images/logos/Slack_join.svg
@@ -0,0 +1,49 @@
+
+
+
diff --git a/obsei_module/obsei-master/images/logos/appstore.png b/obsei_module/obsei-master/images/logos/appstore.png
new file mode 100644
index 0000000000000000000000000000000000000000..a88786b5cfc6d42af039a0cf0ca15ef0f4ae378f
Binary files /dev/null and b/obsei_module/obsei-master/images/logos/appstore.png differ
diff --git a/obsei_module/obsei-master/images/logos/classification.png b/obsei_module/obsei-master/images/logos/classification.png
new file mode 100644
index 0000000000000000000000000000000000000000..c272cc21dd36d86e3e0fedb387fafe758af59bb1
Binary files /dev/null and b/obsei_module/obsei-master/images/logos/classification.png differ
diff --git a/obsei_module/obsei-master/images/logos/dummy.png b/obsei_module/obsei-master/images/logos/dummy.png
new file mode 100644
index 0000000000000000000000000000000000000000..65a21ae2882d0e26c88e1694425155a81e418507
Binary files /dev/null and b/obsei_module/obsei-master/images/logos/dummy.png differ
diff --git a/obsei_module/obsei-master/images/logos/elastic.png b/obsei_module/obsei-master/images/logos/elastic.png
new file mode 100644
index 0000000000000000000000000000000000000000..47e53a58326b6bf4d5bc8a9a48811c360311a7dd
Binary files /dev/null and b/obsei_module/obsei-master/images/logos/elastic.png differ
diff --git a/obsei_module/obsei-master/images/logos/facebook.png b/obsei_module/obsei-master/images/logos/facebook.png
new file mode 100644
index 0000000000000000000000000000000000000000..b4374d2b0528a0ec9f617a5c7ee33722e98a7d19
Binary files /dev/null and b/obsei_module/obsei-master/images/logos/facebook.png differ
diff --git a/obsei_module/obsei-master/images/logos/gmail.png b/obsei_module/obsei-master/images/logos/gmail.png
new file mode 100644
index 0000000000000000000000000000000000000000..357c439d8d8d7f884a3eeaaf1e1412d73ff5a72c
Binary files /dev/null and b/obsei_module/obsei-master/images/logos/gmail.png differ
diff --git a/obsei_module/obsei-master/images/logos/googlenews.png b/obsei_module/obsei-master/images/logos/googlenews.png
new file mode 100644
index 0000000000000000000000000000000000000000..a7cf45acdb3bb1303a7e2b7ac8f2db9de11da558
Binary files /dev/null and b/obsei_module/obsei-master/images/logos/googlenews.png differ
diff --git a/obsei_module/obsei-master/images/logos/http_api.png b/obsei_module/obsei-master/images/logos/http_api.png
new file mode 100644
index 0000000000000000000000000000000000000000..8bc5c9ae4f47022088572a1753711bf5fd669948
Binary files /dev/null and b/obsei_module/obsei-master/images/logos/http_api.png differ
diff --git a/obsei_module/obsei-master/images/logos/jira.png b/obsei_module/obsei-master/images/logos/jira.png
new file mode 100644
index 0000000000000000000000000000000000000000..d92cf6f845594ace475c07c07cbbcfac0907a660
Binary files /dev/null and b/obsei_module/obsei-master/images/logos/jira.png differ
diff --git a/obsei_module/obsei-master/images/logos/logger.png b/obsei_module/obsei-master/images/logos/logger.png
new file mode 100644
index 0000000000000000000000000000000000000000..34a68ee5a5e73c2b9963ff9284482dcf4effe0f3
Binary files /dev/null and b/obsei_module/obsei-master/images/logos/logger.png differ
diff --git a/obsei_module/obsei-master/images/logos/ner.png b/obsei_module/obsei-master/images/logos/ner.png
new file mode 100644
index 0000000000000000000000000000000000000000..06297eabfb3d5d589943decf931e2ac096ee38d3
Binary files /dev/null and b/obsei_module/obsei-master/images/logos/ner.png differ
diff --git a/obsei_module/obsei-master/images/logos/obsei_200x200.png b/obsei_module/obsei-master/images/logos/obsei_200x200.png
new file mode 100644
index 0000000000000000000000000000000000000000..cb4cb25373412834d3384c0308c8039667111876
Binary files /dev/null and b/obsei_module/obsei-master/images/logos/obsei_200x200.png differ
diff --git a/obsei_module/obsei-master/images/logos/pandas.svg b/obsei_module/obsei-master/images/logos/pandas.svg
new file mode 100644
index 0000000000000000000000000000000000000000..1451f57de198e7283f900a2538212c3ee27458f9
--- /dev/null
+++ b/obsei_module/obsei-master/images/logos/pandas.svg
@@ -0,0 +1,111 @@
+
+
diff --git a/obsei_module/obsei-master/images/logos/pii.png b/obsei_module/obsei-master/images/logos/pii.png
new file mode 100644
index 0000000000000000000000000000000000000000..13a6826f6f8aed02e7b1e89a2a9fac1ff3510481
Binary files /dev/null and b/obsei_module/obsei-master/images/logos/pii.png differ
diff --git a/obsei_module/obsei-master/images/logos/playstore.png b/obsei_module/obsei-master/images/logos/playstore.png
new file mode 100644
index 0000000000000000000000000000000000000000..c054cd04bb47e26fdba2d5b66071a63317182f36
Binary files /dev/null and b/obsei_module/obsei-master/images/logos/playstore.png differ
diff --git a/obsei_module/obsei-master/images/logos/reddit.png b/obsei_module/obsei-master/images/logos/reddit.png
new file mode 100644
index 0000000000000000000000000000000000000000..695eff14557b7ae25f594febd4cd562013fb9c5a
Binary files /dev/null and b/obsei_module/obsei-master/images/logos/reddit.png differ
diff --git a/obsei_module/obsei-master/images/logos/sentiment.png b/obsei_module/obsei-master/images/logos/sentiment.png
new file mode 100644
index 0000000000000000000000000000000000000000..632d49b471815a10ad16e6bdbe0db53a549b6076
Binary files /dev/null and b/obsei_module/obsei-master/images/logos/sentiment.png differ
diff --git a/obsei_module/obsei-master/images/logos/slack.svg b/obsei_module/obsei-master/images/logos/slack.svg
new file mode 100644
index 0000000000000000000000000000000000000000..c37dc5eb49e3ef638f9dd6f4cf9ab345db8c141d
--- /dev/null
+++ b/obsei_module/obsei-master/images/logos/slack.svg
@@ -0,0 +1,33 @@
+
+
+
diff --git a/obsei_module/obsei-master/images/logos/translator.png b/obsei_module/obsei-master/images/logos/translator.png
new file mode 100644
index 0000000000000000000000000000000000000000..2aa0174fa5f132d6474e051e9be503c105f03719
Binary files /dev/null and b/obsei_module/obsei-master/images/logos/translator.png differ
diff --git a/obsei_module/obsei-master/images/logos/twitter.png b/obsei_module/obsei-master/images/logos/twitter.png
new file mode 100644
index 0000000000000000000000000000000000000000..4e72f8cb100d33850ff0141e009afc30cd0c79b0
Binary files /dev/null and b/obsei_module/obsei-master/images/logos/twitter.png differ
diff --git a/obsei_module/obsei-master/images/logos/webcrawler.png b/obsei_module/obsei-master/images/logos/webcrawler.png
new file mode 100644
index 0000000000000000000000000000000000000000..0fd7f3623df18ed0493e08d526167e7cd1e1ee7e
Binary files /dev/null and b/obsei_module/obsei-master/images/logos/webcrawler.png differ
diff --git a/obsei_module/obsei-master/images/logos/zendesk.png b/obsei_module/obsei-master/images/logos/zendesk.png
new file mode 100644
index 0000000000000000000000000000000000000000..78bfb22e62f72e0a034b8e14a8964d9e5c185375
Binary files /dev/null and b/obsei_module/obsei-master/images/logos/zendesk.png differ
diff --git a/obsei_module/obsei-master/images/obsei-flyer.png b/obsei_module/obsei-master/images/obsei-flyer.png
new file mode 100644
index 0000000000000000000000000000000000000000..e94f831d50cceada70f07a5e1f6814201f7fa76b
Binary files /dev/null and b/obsei_module/obsei-master/images/obsei-flyer.png differ
diff --git a/obsei_module/obsei-master/images/obsei-ui-demo.png b/obsei_module/obsei-master/images/obsei-ui-demo.png
new file mode 100644
index 0000000000000000000000000000000000000000..c0ce9bc92418c70a4e5b7210f7ab89024820dc17
Binary files /dev/null and b/obsei_module/obsei-master/images/obsei-ui-demo.png differ
diff --git a/obsei_module/obsei-master/images/obsei_flow.gif b/obsei_module/obsei-master/images/obsei_flow.gif
new file mode 100644
index 0000000000000000000000000000000000000000..a6538b34996b6092ce3c978b1daf8ff9ad4da683
--- /dev/null
+++ b/obsei_module/obsei-master/images/obsei_flow.gif
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2bb0b0b15bac52084145aea23f9b47b207853ce9c45d4c355ccadffadc129bb9
+size 6226733
diff --git a/obsei_module/obsei-master/mypy.ini b/obsei_module/obsei-master/mypy.ini
new file mode 100644
index 0000000000000000000000000000000000000000..976ba0294638950e865be3934cbeee3b6305ffd6
--- /dev/null
+++ b/obsei_module/obsei-master/mypy.ini
@@ -0,0 +1,2 @@
+[mypy]
+ignore_missing_imports = True
diff --git a/obsei_module/obsei-master/obsei/__init__.py b/obsei_module/obsei-master/obsei/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..845724ca59968b71689a82b2b48d7bd93f142a0c
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/__init__.py
@@ -0,0 +1,19 @@
+import logging
+
+from obsei._version import __version__
+
+# Configure root logging at package import so examples get timestamped output.
+# NOTE(review): calling basicConfig from a library import mutates the host
+# application's logging setup — confirm this side effect is intended.
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+    datefmt="%m/%d/%Y %H:%M:%S",
+    level=logging.INFO,
+)
+
+init_logger: logging.Logger = logging.getLogger(__name__)
+
+# Emitted as a warning on every import to point users at the optional extras.
+installation_message: str = """
+By default `pip install obsei` will only install core dependencies.
+To install all required dependencies use `pip install obsei[all]`.
+Refer https://obsei.com/#install-obsei for more options.
+"""
+
+init_logger.warning(installation_message)
diff --git a/obsei_module/obsei-master/obsei/_version.py b/obsei_module/obsei-master/obsei/_version.py
new file mode 100644
index 0000000000000000000000000000000000000000..6561790f155f6bfd436e5b19b2f0a1e7f20c0259
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/_version.py
@@ -0,0 +1 @@
+# Single source of truth for the package version (read by obsei/__init__.py).
+__version__ = "0.0.15"
diff --git a/obsei_module/obsei-master/obsei/analyzer/__init__.py b/obsei_module/obsei-master/obsei/analyzer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/obsei_module/obsei-master/obsei/analyzer/base_analyzer.py b/obsei_module/obsei-master/obsei/analyzer/base_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..aae074243dd431a9f48e7e253f627dc07ecaf7f2
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/analyzer/base_analyzer.py
@@ -0,0 +1,82 @@
+from abc import abstractmethod
+from typing import Any, Generator, List, Optional
+
+from pydantic import Field, PrivateAttr
+from pydantic_settings import BaseSettings
+
+from obsei.misc import gpu_util
+from obsei.payload import TextPayload
+from obsei.postprocessor.inference_aggregator import (
+ InferenceAggregator,
+ InferenceAggregatorConfig,
+)
+from obsei.preprocessor.text_splitter import TextSplitter, TextSplitterConfig
+from obsei.workflow.base_store import BaseStore
+
+# Fallback input-token cap used when a model config lacks max_position_embeddings.
+MAX_LENGTH: int = 510
+# Default inference batch sizes, selected by resolved device in BaseAnalyzer.
+DEFAULT_BATCH_SIZE_GPU: int = 64
+DEFAULT_BATCH_SIZE_CPU: int = 4
+
+
+class BaseAnalyzerConfig(BaseSettings):
+ TYPE: str = "Base"
+ use_splitter_and_aggregator: Optional[bool] = False
+ splitter_config: Optional[TextSplitterConfig] = None
+ aggregator_config: Optional[InferenceAggregatorConfig] = None
+
+ def __init__(self, **data: Any):
+ super().__init__(**data)
+
+ if self.use_splitter_and_aggregator and not self.splitter_config and not self.aggregator_config:
+ raise AttributeError("Need splitter_config and aggregator_config if enabling use_splitter_and_aggregator "
+ "option")
+
+ class Config:
+ arbitrary_types_allowed = True
+
+
+class BaseAnalyzer(BaseSettings):
+ _device_id: int = PrivateAttr()
+ TYPE: str = "Base"
+ store: Optional[BaseStore] = None
+ device: str = "auto"
+ batch_size: int = -1
+ splitter: TextSplitter = Field(default=TextSplitter())
+ aggregator: InferenceAggregator = Field(default=InferenceAggregator())
+
+ """
+ auto: choose gpu if present else use cpu
+ cpu: use cpu
+ cuda:{id} - cuda device id
+ """
+
+ def __init__(self, **data: Any):
+ super().__init__(**data)
+
+ self._device_id = gpu_util.get_device_id(self.device)
+ if self.batch_size < 0:
+ self.batch_size = (
+ DEFAULT_BATCH_SIZE_CPU
+ if self._device_id == 0
+ else DEFAULT_BATCH_SIZE_GPU
+ )
+
+ @abstractmethod
+ def analyze_input(
+ self,
+ source_response_list: List[TextPayload],
+ analyzer_config: Optional[BaseAnalyzerConfig] = None,
+ **kwargs: Any,
+ ) -> List[TextPayload]:
+ pass
+
+ @staticmethod
+ def batchify(
+ payload_list: List[TextPayload],
+ batch_size: int,
+ ) -> Generator[List[TextPayload], None, None]:
+ for index in range(0, len(payload_list), batch_size):
+ yield payload_list[index : index + batch_size]
+
+ class Config:
+ arbitrary_types_allowed = True
diff --git a/obsei_module/obsei-master/obsei/analyzer/classification_analyzer.py b/obsei_module/obsei-master/obsei/analyzer/classification_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f51a48313772ee5883fc1374a114eb977346139
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/analyzer/classification_analyzer.py
@@ -0,0 +1,204 @@
+import logging
+from typing import Any, Dict, List, Optional
+
+from pydantic import Field, PrivateAttr
+from transformers import Pipeline, pipeline
+
+from obsei.analyzer.base_analyzer import (
+ BaseAnalyzer,
+ BaseAnalyzerConfig,
+ MAX_LENGTH,
+)
+from obsei.payload import TextPayload
+from obsei.postprocessor.inference_aggregator import InferenceAggregatorConfig
+from obsei.postprocessor.inference_aggregator_function import ClassificationAverageScore
+
+logger = logging.getLogger(__name__)
+
+
+class ClassificationAnalyzerConfig(BaseAnalyzerConfig):
+    """Configuration for classification analyzers (plain and zero-shot)."""
+
+    TYPE: str = "Classification"
+    # Candidate labels for zero-shot classification; None for plain models.
+    labels: Optional[List[str]] = None
+    # Optional mapping used to rename model output labels.
+    label_map: Optional[Dict[str, str]] = None
+    multi_class_classification: bool = True
+    add_positive_negative_labels: bool = True
+    aggregator_config: InferenceAggregatorConfig = Field(
+        InferenceAggregatorConfig(aggregate_function=ClassificationAverageScore())
+    )
+
+    def __init__(self, **data: Any):
+        super().__init__(**data)
+
+        # Without candidate labels the zero-shot-specific flags are meaningless.
+        if self.labels is None:
+            self.multi_class_classification = False
+            self.add_positive_negative_labels = False
+
+
+class TextClassificationAnalyzer(BaseAnalyzer):
+    """Analyzer backed by a HuggingFace `text-classification` pipeline."""
+
+    TYPE: str = "Classification"
+    pipeline_name: str = "text-classification"
+    _pipeline: Pipeline = PrivateAttr()
+    _max_length: int = PrivateAttr()
+    model_name_or_path: str
+
+    def __init__(self, **data: Any):
+        super().__init__(**data)
+        # Build the transformers pipeline on the device resolved by BaseAnalyzer.
+        self._pipeline = pipeline(
+            self.pipeline_name,
+            model=self.model_name_or_path,
+            device=self._device_id,
+        )
+
+        # Truncate inputs to the model's positional limit when it declares one.
+        if hasattr(self._pipeline.model.config, "max_position_embeddings"):
+            self._max_length = self._pipeline.model.config.max_position_embeddings
+        else:
+            self._max_length = MAX_LENGTH
+
+    def prediction_from_model(
+        self,
+        texts: List[str],
+        analyzer_config: Optional[ClassificationAnalyzerConfig] = None,
+    ) -> List[Dict[str, Any]]:
+        """Return one {label: score} dict per text, applying any label_map."""
+        prediction = self._pipeline(texts)
+        # The pipeline returns a single dict for single input; normalize to list.
+        predictions = prediction if isinstance(prediction, list) else [prediction]
+        label_map = analyzer_config.label_map if analyzer_config is not None else {}
+        label_map = label_map or {}
+        return [
+            {
+                label_map.get(prediction["label"], prediction["label"]): prediction["score"]
+            } for prediction in predictions
+        ]
+
+    def analyze_input(  # type: ignore[override]
+        self,
+        source_response_list: List[TextPayload],
+        analyzer_config: Optional[ClassificationAnalyzerConfig] = None,
+        **kwargs: Any,
+    ) -> List[TextPayload]:
+        """Classify payloads in batches, optionally splitting then aggregating."""
+        analyzer_output: List[TextPayload] = []
+
+        # Optionally split long texts into smaller segments before inference.
+        if (
+            analyzer_config is not None
+            and analyzer_config.use_splitter_and_aggregator
+            and analyzer_config.splitter_config
+        ):
+            source_response_list = self.splitter.preprocess_input(
+                source_response_list,
+                config=analyzer_config.splitter_config,
+            )
+
+        for batch_responses in self.batchify(source_response_list, self.batch_size):
+            texts = [
+                source_response.processed_text[: self._max_length]
+                for source_response in batch_responses
+            ]
+
+            batch_predictions = self.prediction_from_model(texts=texts, analyzer_config=analyzer_config)
+
+            for score_dict, source_response in zip(batch_predictions, batch_responses):
+                segmented_data = {
+                    "classifier_data": score_dict
+                }
+
+                # Preserve any segmentation info already attached to the payload.
+                if source_response.segmented_data:
+                    segmented_data = {
+                        **segmented_data,
+                        **source_response.segmented_data,
+                    }
+
+                analyzer_output.append(
+                    TextPayload(
+                        processed_text=source_response.processed_text,
+                        meta=source_response.meta,
+                        segmented_data=segmented_data,
+                        source_name=source_response.source_name,
+                    )
+                )
+
+        # Re-aggregate split segments back into per-document results.
+        if (
+            analyzer_config is not None
+            and analyzer_config.use_splitter_and_aggregator
+            and analyzer_config.aggregator_config
+        ):
+            analyzer_output = self.aggregator.postprocess_input(
+                input_list=analyzer_output,
+                config=analyzer_config.aggregator_config,
+            )
+
+        return analyzer_output
+
+
+class ZeroShotClassificationAnalyzer(TextClassificationAnalyzer):
+ pipeline_name: str = "zero-shot-classification"
+
+ def prediction_from_model(
+ self,
+ texts: List[str],
+ analyzer_config: Optional[ClassificationAnalyzerConfig] = None,
+ ) -> List[Dict[str, Any]]:
+ if analyzer_config is None:
+ raise ValueError("analyzer_config can't be None")
+
+ labels = analyzer_config.labels or []
+ if analyzer_config.add_positive_negative_labels:
+ if "positive" not in labels:
+ labels.append("positive")
+ if "negative" not in labels:
+ labels.append("negative")
+
+ if len(labels) == 0:
+ raise ValueError("`labels` can't be empty or `add_positive_negative_labels` should be False")
+
+ prediction = self._pipeline(
+ texts, candidate_labels=labels, multi_label=analyzer_config.multi_class_classification
+ )
+ predictions = prediction if isinstance(prediction, list) else [prediction]
+
+ return [dict(zip(prediction["labels"], prediction["scores"])) for prediction in predictions]
+
+ def analyze_input( # type: ignore[override]
+ self,
+ source_response_list: List[TextPayload],
+ analyzer_config: Optional[ClassificationAnalyzerConfig] = None,
+ **kwargs: Any,
+ ) -> List[TextPayload]:
+ if analyzer_config is None:
+ raise ValueError("analyzer_config can't be None")
+
+ return super().analyze_input(
+ source_response_list=source_response_list,
+ analyzer_config=analyzer_config,
+ **kwargs
+ )
+
+
+# Step 1: Define your configuration (labels, etc.)
+analyzer_config = ClassificationAnalyzerConfig(
+ labels=["Sports", "Politics", "Technology", "Entertainment"], # Example labels
+ multi_class_classification=False,
+ add_positive_negative_labels=False
+)
+
+# Step 2: Initialize the ZeroShotClassificationAnalyzer
+analyzer = ZeroShotClassificationAnalyzer(
+ model_name_or_path="facebook/bart-large-mnli", # Using a pre-trained zero-shot classification model
+ device=-1 # Assuming you have a GPU, use device=-1 for CPU
+)
+
+# Step 3: Prepare the input text (as TextPayload objects)
+texts = [
+ "The new iPhone has been released and it's taking the tech world by storm.",
+ "The latest political debate had strong views on the economy.",
+ "The football match between Barcelona and Madrid ended in a draw."
+]
+
+# Create TextPayloads from the texts
+source_responses = [TextPayload(processed_text=text) for text in texts]
+
+# Step 4: Run the analysis
+results = analyzer.analyze_input(source_response_list=source_responses, analyzer_config=analyzer_config)
+
+# Step 5: Output the results
+for result in results:
+ print(f"Text: {result.processed_text}")
+ print(f"Classification Scores: {result.segmented_data['classifier_data']}")
\ No newline at end of file
diff --git a/obsei_module/obsei-master/obsei/analyzer/dummy_analyzer.py b/obsei_module/obsei-master/obsei/analyzer/dummy_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..1119107061ce80db049c18acc7a43c4ef34bf09d
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/analyzer/dummy_analyzer.py
@@ -0,0 +1,46 @@
+from typing import Any, List, Optional
+
+from obsei.analyzer.base_analyzer import (
+ BaseAnalyzer,
+ BaseAnalyzerConfig,
+)
+from obsei.payload import TextPayload
+
+
+class DummyAnalyzerConfig(BaseAnalyzerConfig):
+ TYPE: str = "Dummy"
+ dummy_data: Optional[Any] = None
+
+ def __init__(self, **data: Any):
+ super().__init__(**data)
+
+
+class DummyAnalyzer(BaseAnalyzer):
+ def analyze_input( # type: ignore[override]
+ self,
+ source_response_list: List[TextPayload],
+ analyzer_config: Optional[DummyAnalyzerConfig] = None,
+ **kwargs: Any,
+ ) -> List[TextPayload]:
+ responses = []
+ for source_response in source_response_list:
+
+ segmented_data = {
+ "dummy_data": None
+ if not analyzer_config
+ else analyzer_config.dummy_data
+ }
+
+ if source_response.segmented_data:
+ segmented_data = {**segmented_data, **source_response.segmented_data}
+
+ responses.append(
+ TextPayload(
+ processed_text=source_response.processed_text,
+ meta=source_response.meta,
+ source_name=source_response.source_name,
+ segmented_data=segmented_data,
+ )
+ )
+
+ return responses
diff --git a/obsei_module/obsei-master/obsei/analyzer/ner_analyzer.py b/obsei_module/obsei-master/obsei/analyzer/ner_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..847f722cbfe0c5fcf835286d08df4d106de1849e
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/analyzer/ner_analyzer.py
@@ -0,0 +1,165 @@
+import logging
+from typing import Any, Dict, Generator, List, Optional, Tuple, Iterator
+from pydantic import PrivateAttr
+from transformers import (
+ AutoModelForTokenClassification,
+ AutoTokenizer,
+ Pipeline,
+ pipeline,
+)
+import spacy
+from spacy.language import Language
+from spacy.tokens.doc import Doc
+from obsei.analyzer.base_analyzer import (
+ BaseAnalyzer,
+ BaseAnalyzerConfig,
+ MAX_LENGTH,
+)
+from obsei.payload import TextPayload
+
+logger = logging.getLogger(__name__)
+
+
+class TransformersNERAnalyzer(BaseAnalyzer):
+    """NER analyzer backed by a HuggingFace token-classification pipeline."""
+
+    _pipeline: Pipeline = PrivateAttr()
+    _max_length: int = PrivateAttr()
+    TYPE: str = "NER"
+    model_name_or_path: str
+    tokenizer_name: Optional[str] = None
+    grouped_entities: Optional[bool] = True
+
+    def __init__(self, **data: Any):
+        super().__init__(**data)
+
+        model = AutoModelForTokenClassification.from_pretrained(self.model_name_or_path)
+        tokenizer = AutoTokenizer.from_pretrained(
+            self.tokenizer_name if self.tokenizer_name else self.model_name_or_path,
+            use_fast=True,
+        )
+
+        # NOTE(review): `grouped_entities` is deprecated in newer transformers
+        # releases in favour of `aggregation_strategy` — confirm pinned version.
+        self._pipeline = pipeline(
+            "ner",
+            model=model,
+            tokenizer=tokenizer,
+            grouped_entities=self.grouped_entities,
+            device=self._device_id,
+        )
+
+        # Truncate inputs to the model's positional limit when it declares one.
+        if hasattr(self._pipeline.model.config, "max_position_embeddings"):
+            self._max_length = self._pipeline.model.config.max_position_embeddings
+        else:
+            self._max_length = MAX_LENGTH
+
+    def _prediction_from_model(self, texts: List[str]) -> List[List[Dict[str, float]]]:
+        """Run the pipeline and normalize output to one list per input text."""
+        prediction = self._pipeline(texts)
+        return (  # type: ignore[no-any-return]
+            prediction
+            if len(prediction) and isinstance(prediction[0], list)
+            else [prediction]
+        )
+
+    def analyze_input(
+        self,
+        source_response_list: List[TextPayload],
+        analyzer_config: Optional[BaseAnalyzerConfig] = None,
+        **kwargs: Any,
+    ) -> List[TextPayload]:
+        """Attach `ner_data` entity predictions to each payload."""
+        analyzer_output: List[TextPayload] = []
+
+        for batch_responses in self.batchify(source_response_list, self.batch_size):
+            texts = [
+                source_response.processed_text[: self._max_length]
+                for source_response in batch_responses
+            ]
+            batch_predictions = self._prediction_from_model(texts)
+
+            for prediction, source_response in zip(batch_predictions, batch_responses):
+                segmented_data = {"ner_data": prediction}
+                # Preserve any segmentation info already attached to the payload.
+                if source_response.segmented_data:
+                    segmented_data = {
+                        **segmented_data,
+                        **source_response.segmented_data,
+                    }
+
+                analyzer_output.append(
+                    TextPayload(
+                        processed_text=source_response.processed_text,
+                        meta=source_response.meta,
+                        segmented_data=segmented_data,
+                        source_name=source_response.source_name,
+                    )
+                )
+        return analyzer_output
+
+
+class SpacyNERAnalyzer(BaseAnalyzer):
+    """NER analyzer backed by a spaCy pipeline (non-NER components disabled)."""
+
+    _nlp: Language = PrivateAttr()
+    TYPE: str = "NER"
+    model_name_or_path: str
+    # NOTE(review): `tokenizer_name` and `grouped_entities` are accepted here
+    # but never used by this spaCy implementation — confirm before relying.
+    tokenizer_name: Optional[str] = None
+    grouped_entities: Optional[bool] = True
+    # Number of OS processes spaCy uses inside nlp.pipe.
+    n_process: int = 1
+
+    def __init__(self, **data: Any):
+        super().__init__(**data)
+        # Only entities are needed; disable the rest of the pipeline for speed.
+        self._nlp = spacy.load(
+            self.model_name_or_path,
+            disable=["tagger", "parser", "attribute_ruler", "lemmatizer"],
+        )
+
+    def _spacy_pipe_batchify(
+        self,
+        texts: List[str],
+        batch_size: int,
+        source_response_list: List[TextPayload],
+    ) -> Generator[Tuple[Iterator[Doc], List[TextPayload]], None, None]:
+        """Yield aligned (docs, payloads) pairs, one batch at a time."""
+        for index in range(0, len(texts), batch_size):
+            yield (
+                self._nlp.pipe(
+                    texts=texts[index: index + batch_size],
+                    batch_size=batch_size,
+                    n_process=self.n_process,
+                ),
+                source_response_list[index: index + batch_size],
+            )
+
+    def analyze_input(
+        self,
+        source_response_list: List[TextPayload],
+        analyzer_config: Optional[BaseAnalyzerConfig] = None,
+        **kwargs: Any,
+    ) -> List[TextPayload]:
+        """Attach `ner_data` (entity_group/word/start/end) to each payload."""
+        analyzer_output: List[TextPayload] = []
+        texts = [
+            source_response.processed_text for source_response in source_response_list
+        ]
+
+        for batch_docs, batch_source_response in self._spacy_pipe_batchify(
+            texts, self.batch_size, source_response_list
+        ):
+            for doc, source_response in zip(batch_docs, batch_source_response):
+                # Convert spaCy entity spans to the transformers-like schema.
+                ner_prediction = [
+                    {
+                        "entity_group": ent.label_,
+                        "word": ent.text,
+                        "start": ent.start_char,
+                        "end": ent.end_char,
+                    }
+                    for ent in doc.ents
+                ]
+                segmented_data = {"ner_data": ner_prediction}
+                # Preserve any segmentation info already attached to the payload.
+                if source_response.segmented_data:
+                    segmented_data = {
+                        **segmented_data,
+                        **source_response.segmented_data,
+                    }
+                analyzer_output.append(
+                    TextPayload(
+                        processed_text=source_response.processed_text,
+                        meta=source_response.meta,
+                        segmented_data=segmented_data,
+                        source_name=source_response.source_name,
+                    )
+                )
+
+        return analyzer_output
diff --git a/obsei_module/obsei-master/obsei/analyzer/pii_analyzer.py b/obsei_module/obsei-master/obsei/analyzer/pii_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3826c7b1491a938c4399d55f1f74dc00c9e2475
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/analyzer/pii_analyzer.py
@@ -0,0 +1,191 @@
+import logging
+from typing import Any, Dict, List, Optional
+
+from presidio_analyzer import AnalyzerEngine, EntityRecognizer
+from presidio_anonymizer import AnonymizerEngine
+from presidio_analyzer.nlp_engine import NlpEngineProvider
+from presidio_anonymizer.entities.engine import OperatorConfig
+from pydantic import BaseModel, Field, PrivateAttr
+
+from obsei.analyzer.base_analyzer import (
+ BaseAnalyzer,
+ BaseAnalyzerConfig,
+)
+from obsei.payload import TextPayload
+
+logger = logging.getLogger(__name__)
+
+
class PresidioModelConfig(BaseModel):
    """spaCy model selection for one Presidio-supported language."""

    # Language code the model handles (Presidio's default flow is English).
    lang_code: Optional[str] = Field("en")
    # Name of the installed spaCy pipeline package to load.
    model_name: Optional[str] = Field("en_core_web_lg")
+
+
class PresidioEngineConfig(BaseModel):
    """NLP engine configuration for Presidio: engine name plus per-language models."""

    nlp_engine_name: Optional[str] = Field("spacy")
    models: Optional[List[PresidioModelConfig]] = None

    def __init__(self, **data: Any):
        super().__init__(**data)

        # Fall back to a single default (English spaCy) model configuration.
        if not self.models:
            self.models = [PresidioModelConfig()]
+
+
class PresidioAnonymizerConfig(OperatorConfig, BaseModel):  # type: ignore
    """Pydantic-compatible wrapper around Presidio's ``OperatorConfig``."""

    def __init__(self, anonymizer_name: str, params: Optional[Dict[str, Any]] = None):
        # OperatorConfig is not a pydantic model; forward the arguments directly.
        super().__init__(anonymizer_name=anonymizer_name, params=params)

    class Config:
        arbitrary_types_allowed = True
+
+
class PresidioPIIAnalyzerConfig(BaseAnalyzerConfig):
    """Per-request options for the Presidio PII analyzer."""

    TYPE: str = "PresidioPII"
    # Per-entity anonymization operators; see https://microsoft.github.io/presidio/anonymizer/
    anonymizers_config: Optional[Dict[str, PresidioAnonymizerConfig]] = None
    # Entities to search for; see https://microsoft.github.io/presidio/supported_entities/
    # By default (None) all supported entities are searched.
    entities: Optional[List[str]] = None
    # When True, only detect PII — skip the anonymization pass entirely.
    analyze_only: Optional[bool] = False
    # When True, the output payload's text is the anonymized text.
    replace_original_text: Optional[bool] = True
    # Whether the analysis decision-process steps are returned in the response.
    return_decision_process: Optional[bool] = False
+
+
class PresidioPIIAnalyzer(BaseAnalyzer):
    """Analyzer that detects — and optionally anonymizes — PII via Microsoft Presidio.

    The analyzer engine performs entity detection; the anonymizer engine
    rewrites detected spans according to the configured operators.
    """

    _analyzer: AnalyzerEngine = PrivateAttr()
    _anonymizer: AnonymizerEngine = PrivateAttr()
    TYPE: str = "PresidioPII"
    engine_config: Optional[PresidioEngineConfig] = None
    # To see list of supported entities refer https://microsoft.github.io/presidio/supported_entities/
    # To add customer recognizers refer https://microsoft.github.io/presidio/analyzer/adding_recognizers/
    entity_recognizers: Optional[List[EntityRecognizer]] = None
    # To find more details refer https://microsoft.github.io/presidio/anonymizer/
    anonymizers_config: Optional[Dict[str, OperatorConfig]] = None

    def __init__(self, **data: Any):
        """Build the Presidio analyzer/anonymizer engines, downloading spaCy
        models on demand when the spacy NLP engine is configured."""
        super().__init__(**data)

        if not self.engine_config:
            self.engine_config = PresidioEngineConfig()

        if not self.engine_config.models or len(self.engine_config.models) == 0:
            self.engine_config.models = [PresidioModelConfig()]

        # Collect supported languages and, for the spacy engine, ensure each
        # configured model is locally available (downloading it if needed).
        languages = []
        for model_config in self.engine_config.models:
            languages.append(model_config.lang_code)

            # Check SpacyNlpEngine.engine_name
            if (
                self.engine_config.nlp_engine_name == "spacy"
                and model_config.model_name is not None
            ):
                try:
                    spacy_model = __import__(model_config.model_name)
                    spacy_model.load()
                    logger.info(
                        f"Spacy model {model_config.model_name} is already downloaded"
                    )
                # Fix: was a bare `except:`, which would also trap
                # KeyboardInterrupt/SystemExit while probing for the model.
                except Exception:
                    logger.warning(
                        f"Spacy model {model_config.model_name} is not downloaded"
                    )
                    logger.warning(
                        f"Downloading spacy model {model_config.model_name}, it might take some time"
                    )
                    from spacy.cli import download  # type: ignore

                    download(model_config.model_name)

        # Create NLP engine based on configuration
        provider = NlpEngineProvider(nlp_configuration=self.engine_config.dict())
        nlp_engine = provider.create_engine()

        # Pass the created NLP engine and supported_languages to the AnalyzerEngine
        self._analyzer = AnalyzerEngine(
            nlp_engine=nlp_engine, supported_languages=languages
        )

        # self._analyzer.registry.load_predefined_recognizers()
        if self.entity_recognizers:
            for entity_recognizer in self.entity_recognizers:
                self._analyzer.registry.add_recognizer(entity_recognizer)

        # Initialize the anonymizer with logger
        self._anonymizer = AnonymizerEngine()

    def analyze_input(  # type: ignore[override]
        self,
        source_response_list: List[TextPayload],
        analyzer_config: Optional[PresidioPIIAnalyzerConfig] = None,
        language: Optional[str] = "en",
        **kwargs: Any,
    ) -> List[TextPayload]:
        """Detect PII in every payload and optionally anonymize the text.

        Args:
            source_response_list: Payloads to inspect.
            analyzer_config: Presidio options (entities, anonymizers,
                analyze-only mode). Required.
            language: Language of the input texts.

        Returns:
            New payloads whose ``segmented_data["pii_data"]`` holds analyzer
            and anonymizer results; ``processed_text`` is the anonymized text
            when ``replace_original_text`` is enabled.

        Raises:
            ValueError: if ``analyzer_config`` is None.
        """
        if analyzer_config is None:
            raise ValueError("analyzer_config can't be None")

        analyzer_output: List[TextPayload] = []

        for batch_responses in self.batchify(source_response_list, self.batch_size):
            for source_response in batch_responses:
                analyzer_result = self._analyzer.analyze(
                    text=source_response.processed_text,
                    entities=analyzer_config.entities,
                    return_decision_process=analyzer_config.return_decision_process,
                    language=language,
                )

                anonymized_result = None
                if not analyzer_config.analyze_only:
                    # Per-request config wins over the analyzer-level default.
                    anonymizers_config = (
                        analyzer_config.anonymizers_config or self.anonymizers_config
                    )

                    if (
                        source_response.processed_text is not None
                        and len(source_response.processed_text) > 0
                    ):
                        anonymized_result = self._anonymizer.anonymize(
                            text=source_response.processed_text,
                            operators=anonymizers_config,
                            analyzer_results=analyzer_result,
                        )

                if (
                    analyzer_config.replace_original_text
                    and anonymized_result is not None
                ):
                    text = anonymized_result.text
                else:
                    text = source_response.processed_text

                segmented_data = {
                    "pii_data": {
                        "analyzer_result": [vars(result) for result in analyzer_result],
                        "anonymized_result": None
                        if not anonymized_result
                        else [vars(item) for item in anonymized_result.items],
                        "anonymized_text": None
                        if not anonymized_result
                        else anonymized_result.text,
                    }
                }
                # Preserve upstream segmented data (it wins on key collisions).
                if source_response.segmented_data:
                    segmented_data = {
                        **segmented_data,
                        **source_response.segmented_data,
                    }

                analyzer_output.append(
                    TextPayload(
                        processed_text=text,
                        meta=source_response.meta,
                        segmented_data=segmented_data,
                        source_name=source_response.source_name,
                    )
                )

        return analyzer_output
diff --git a/obsei_module/obsei-master/obsei/analyzer/sentiment_analyzer.py b/obsei_module/obsei-master/obsei/analyzer/sentiment_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b03baa8b27721ef4d1fa65bd05675088bb59c68e
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/analyzer/sentiment_analyzer.py
@@ -0,0 +1,94 @@
+import logging
+from typing import Any, List, Optional
+
+from pydantic import PrivateAttr
+from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
+
+from obsei.analyzer.base_analyzer import (
+ BaseAnalyzer,
+ BaseAnalyzerConfig,
+)
+from obsei.payload import TextPayload
+from obsei.analyzer.classification_analyzer import (
+ ClassificationAnalyzerConfig,
+ ZeroShotClassificationAnalyzer,
+)
+
+logger = logging.getLogger(__name__)
+
+
class VaderSentimentAnalyzer(BaseAnalyzer):
    """Sentiment analyzer backed by VADER's rule-based polarity scorer."""

    _model: SentimentIntensityAnalyzer = PrivateAttr()
    TYPE: str = "Sentiment"

    def __init__(self, **data: Any):
        super().__init__(**data)
        self._model = SentimentIntensityAnalyzer()

    def _get_sentiment_score_from_vader(self, text: str) -> float:
        """Return VADER's compound polarity score for ``text`` (range [-1, 1])."""
        return float(self._model.polarity_scores(text)["compound"])

    def analyze_input(
        self,
        source_response_list: List[TextPayload],
        analyzer_config: Optional[BaseAnalyzerConfig] = None,
        **kwargs: Any,
    ) -> List[TextPayload]:
        """Score each payload and attach complementary positive/negative weights
        under ``segmented_data["classifier_data"]``."""
        outputs: List[TextPayload] = []

        for batch in self.batchify(source_response_list, self.batch_size):
            for payload in batch:
                compound = self._get_sentiment_score_from_vader(
                    payload.processed_text
                )
                # Map the signed compound score onto two weights summing to 1.
                if compound < 0.0:
                    scores = {"negative": -compound}
                    scores["positive"] = 1.0 - scores["negative"]
                else:
                    scores = {"positive": compound}
                    scores["negative"] = 1.0 - scores["positive"]

                merged = {"classifier_data": scores}
                if payload.segmented_data:
                    merged.update(payload.segmented_data)

                outputs.append(
                    TextPayload(
                        processed_text=payload.processed_text,
                        meta=payload.meta,
                        segmented_data=merged,
                        source_name=payload.source_name,
                    )
                )

        return outputs
+
+
class TransformersSentimentAnalyzerConfig(ClassificationAnalyzerConfig):
    """Classification config preset for binary sentiment analysis."""

    TYPE: str = "Sentiment"
    # Fixed label set used for zero-shot sentiment scoring.
    labels: List[str] = ["positive", "negative"]
    # Sentiment is single-label: positive/negative scores are complementary.
    multi_class_classification: bool = False
+
+
class TransformersSentimentAnalyzer(ZeroShotClassificationAnalyzer):
    """Zero-shot classifier specialized to positive/negative sentiment labels."""

    def analyze_input(  # type: ignore[override]
        self,
        source_response_list: List[TextPayload],
        analyzer_config: Optional[TransformersSentimentAnalyzerConfig] = None,
        **kwargs: Any,
    ) -> List[TextPayload]:
        """Delegate to zero-shot classification, forcing positive/negative labels on."""
        return super().analyze_input(
            source_response_list=source_response_list,
            analyzer_config=analyzer_config,
            add_positive_negative_labels=True,
            **kwargs,
        )
diff --git a/obsei_module/obsei-master/obsei/analyzer/test2.py b/obsei_module/obsei-master/obsei/analyzer/test2.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9ba2cc168ce0da9def9b0c15f3018c467d19843
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/analyzer/test2.py
@@ -0,0 +1,34 @@
"""Ad-hoc demo script: zero-shot topic classification over three sample texts."""
from obsei.payload import TextPayload
from transformers import pipeline  # NOTE(review): unused in this script
from obsei.analyzer.classification_analyzer import ZeroShotClassificationAnalyzer, ClassificationAnalyzerConfig

# Step 1: Define your configuration (labels, etc.)
analyzer_config = ClassificationAnalyzerConfig(
    labels=["Sports", "Politics", "Technology", "Entertainment"],  # Example labels
    multi_class_classification=False,
    add_positive_negative_labels=False
)

# Step 2: Initialize the ZeroShotClassificationAnalyzer
analyzer = ZeroShotClassificationAnalyzer(
    model_name_or_path="facebook/bart-large-mnli",  # Using a pre-trained zero-shot classification model
    device="cpu"  # Runs on CPU; pass "auto" or "cuda:<n>" to use a GPU
)

# Step 3: Prepare the input text (as TextPayload objects)
texts = [
    "The new iPhone has been released and it's taking the tech world by storm.",
    "The latest political debate had strong views on the economy.",
    "The football match between Barcelona and Madrid ended in a draw."
]

# Create TextPayloads from the texts
source_responses = [TextPayload(processed_text=text) for text in texts]

# Step 4: Run the analysis
results = analyzer.analyze_input(source_response_list=source_responses, analyzer_config=analyzer_config)

# Step 5: Output the results
for result in results:
    print(f"Text: {result.processed_text}")
    print(f"Classification Scores: {result.segmented_data['classifier_data']}")
diff --git a/obsei_module/obsei-master/obsei/analyzer/translation_analyzer.py b/obsei_module/obsei-master/obsei/analyzer/translation_analyzer.py
new file mode 100644
index 0000000000000000000000000000000000000000..45d71e9643b1fc5d14c2437645e84f10244d8a1a
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/analyzer/translation_analyzer.py
@@ -0,0 +1,70 @@
+from typing import Any, List, Optional
+
+from pydantic import PrivateAttr
+from transformers import pipeline, Pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
+
+from obsei.analyzer.base_analyzer import (
+ BaseAnalyzer,
+ BaseAnalyzerConfig,
+ MAX_LENGTH,
+)
+from obsei.payload import TextPayload
+
+
class TranslationAnalyzer(BaseAnalyzer):
    """Analyzer that translates payload text with a HuggingFace seq2seq model."""

    _pipeline: Pipeline = PrivateAttr()
    _max_length: int = PrivateAttr()
    TYPE: str = "Translation"
    model_name_or_path: str

    def __init__(self, **data: Any):
        super().__init__(**data)
        tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
        model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name_or_path)
        self._pipeline = pipeline(
            "translation", model=model, tokenizer=tokenizer, device=self._device_id
        )
        # Prefer the model's own positional limit; fall back to the project default.
        model_config = self._pipeline.model.config
        self._max_length = getattr(
            model_config, "max_position_embeddings", MAX_LENGTH
        )

    def analyze_input(
        self,
        source_response_list: List[TextPayload],
        analyzer_config: Optional[BaseAnalyzerConfig] = None,
        **kwargs: Any,
    ) -> List[TextPayload]:
        """Translate each payload, keeping the original text under
        ``segmented_data["translation_data"]["original_text"]``."""
        results = []

        for batch in self.batchify(source_response_list, self.batch_size):
            # NOTE(review): truncation is by characters, not tokens — confirm
            # this is the intended interpretation of max_position_embeddings.
            truncated_texts = [
                payload.processed_text[: self._max_length] for payload in batch
            ]

            predictions = self._pipeline(truncated_texts)

            for prediction, payload in zip(predictions, batch):
                merged = {
                    "translation_data": {
                        "original_text": payload.processed_text
                    }
                }
                if payload.segmented_data:
                    merged.update(payload.segmented_data)

                results.append(
                    TextPayload(
                        processed_text=prediction["translation_text"],
                        meta=payload.meta,
                        segmented_data=merged,
                        source_name=payload.source_name,
                    )
                )

        return results
diff --git a/obsei_module/obsei-master/obsei/configuration.py b/obsei_module/obsei-master/obsei/configuration.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd55fb90e4fcff3236f7e6760e5affdd1974b936
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/configuration.py
@@ -0,0 +1,37 @@
+import logging
+from typing import Any, Dict, Optional, Union
+
+import yaml
+from pydantic import Field
+from pydantic_settings import BaseSettings
+
+from obsei.misc.utils import dict_to_object
+
+logger = logging.getLogger(__name__)
+
+
class ObseiConfiguration(BaseSettings):
    """Load an Obsei workflow configuration from an explicit dict or a YAML file.

    When ``configuration`` is not supplied, the YAML file at
    ``{config_path}/{config_filename}`` is read (paths may also come from the
    ``obsei_config_path`` / ``obsei_config_filename`` environment variables).
    """

    configuration: Optional[Dict[str, Any]] = None
    config_path: Optional[str] = Field(None, env="obsei_config_path")
    config_filename: Optional[str] = Field(None, env="obsei_config_filename")

    def __init__(self, **data: Any):
        super().__init__(**data)

        if self.configuration is None:
            # Fix: the file handle was previously opened without a context
            # manager and never closed.
            # NOTE: FullLoader can construct arbitrary Python-tagged objects;
            # only load trusted configuration files.
            with open(f"{self.config_path}/{self.config_filename}", "r") as config_file:
                self.configuration = yaml.load(config_file, Loader=yaml.FullLoader)
            logger.debug(f"Configuration: {self.configuration}")

    def initialize_instance(self, key_name: Optional[str] = None) -> Union[Any]:
        """Instantiate the object configured under ``key_name``.

        Returns None (with a warning) when the key is missing or empty.
        """
        if (
            key_name is None
            or self.configuration is None
            or key_name not in self.configuration
            or not self.configuration[key_name]
        ):
            logger.warning(f"{key_name} not exist in configuration")
            return None
        return dict_to_object(self.configuration[key_name])
diff --git a/obsei_module/obsei-master/obsei/misc/__init__.py b/obsei_module/obsei-master/obsei/misc/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/obsei_module/obsei-master/obsei/misc/gpu_util.py b/obsei_module/obsei-master/obsei/misc/gpu_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc442f1d8fbee20066ffaf520ecfb26964636e23
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/misc/gpu_util.py
@@ -0,0 +1,18 @@
+import torch
+
+
def is_gpu_available() -> bool:
    """Return True when at least one CUDA device is visible to torch."""
    return bool(torch.cuda.is_available())
+
+
def get_device_id(device: str) -> int:
    """Map a device spec string to a numeric device id.

    Args:
        device: ``"cpu"``, ``"auto"`` (first GPU when available, else CPU),
            or ``"cuda:<index>"``.

    Returns:
        -1 for CPU, otherwise the zero-based CUDA device index.

    Raises:
        ValueError: for any unsupported device string. (Fix: previously a
            generic ``Exception`` was raised; ``ValueError`` is more precise
            and remains caught by existing ``except Exception`` callers.)
    """
    if device == "cpu":
        return -1
    if device == "auto":
        return 0 if is_gpu_available() else -1
    if device.startswith("cuda:"):
        suffix = device[len("cuda:"):]
        if suffix.isnumeric():
            return int(suffix)

    raise ValueError(f"Invalid device: '{device}'")
diff --git a/obsei_module/obsei-master/obsei/misc/utils.py b/obsei_module/obsei-master/obsei/misc/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e192578ed957b5946cc3f3f1397041040fc8b054
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/misc/utils.py
@@ -0,0 +1,210 @@
+import json
+import math
+import time
+
+import dateparser
+from datetime import datetime, timezone
+from importlib import import_module
+from typing import Any, Dict, Optional, Union
+
+from bs4 import BeautifulSoup
+from bs4.element import Comment
+from dateutil.relativedelta import relativedelta
+
+DATETIME_STRING_PATTERN = "%Y-%m-%dT%H:%M:%SZ"
+DEFAULT_LOOKUP_PERIOD = "1h"
+
+
+# Used from https://stackoverflow.com/a/52081812 and modified
def flatten_dict(
    dictionary: Dict[str, Any],
    round_the_float: bool = True,
    float_round_format_str: str = ".2f",
    separator: str = "_",
) -> Dict[str, Any]:
    """Flatten nested dicts (and lists of dicts) into a single-level dict.

    Nested keys are joined with ``separator``. Float values are formatted with
    ``float_round_format_str`` (yielding strings) when ``round_the_float`` is
    set.
    """
    out: Dict[str, Any] = {}
    for key, val in dictionary.items():
        if isinstance(val, dict):
            val = [val]
        if isinstance(val, list):
            for sub_dict in val:
                # Bug fix: the recursive call previously dropped the caller's
                # formatting options and separator, silently using defaults.
                deeper = flatten_dict(
                    sub_dict,
                    round_the_float=round_the_float,
                    float_round_format_str=float_round_format_str,
                    separator=separator,
                ).items()
                out.update({key + separator + key2: val2 for key2, val2 in deeper})
        elif isinstance(val, float) and round_the_float:
            out[key] = format(val, float_round_format_str)
        else:
            out[key] = val
    return out
+
+
def obj_to_json(obj: Any, sort_keys: bool = False, indent: Optional[int] = None) -> Union[bytes, None]:
    """Serialize ``obj`` to UTF-8 encoded JSON bytes; ``None`` passes through."""
    if obj is None:
        return None
    serialized = json.dumps(
        obj,
        default=datetime_handler,
        ensure_ascii=False,
        sort_keys=sort_keys,
        indent=indent,
    )
    return serialized.encode("utf8")
+
+
def obj_to_markdown(
    obj: Any,
    level: int = 1,
    str_enclose_start: Optional[str] = None,
    str_enclose_end: Optional[str] = None,
) -> str:
    """Render a nested object/dict/list as a markdown-style outline.

    Each nesting level adds one ``*`` to the heading prefix. Dict-like objects
    contribute ``"<prefix> <key>"`` headings; list items get bare prefixes.
    String values are optionally wrapped between ``str_enclose_start`` and
    ``str_enclose_end`` (e.g. code fences).
    """
    key_prefix = "*" * level

    markdowns = []
    if is_collection(obj):
        add_key = True
        if hasattr(obj, "__dict__"):
            item_view = obj.__dict__.items()
        elif isinstance(obj, dict):
            item_view = obj.items()
        else:
            # Lists have no keys; enumerate only drives the loop.
            add_key = False
            item_view = enumerate(obj)

        for key, val in item_view:
            if add_key:
                header = f"{key_prefix} {key}"
            else:
                header = key_prefix
            if is_collection(val):
                # Recurse one level deeper, propagating the enclosing markers.
                child_markdown = obj_to_markdown(
                    obj=val,
                    level=level + 1,
                    str_enclose_start=str_enclose_start,
                    str_enclose_end=str_enclose_end,
                )
                markdowns.append(f"{header}\n{child_markdown}")
            elif str_enclose_start is not None and isinstance(val, str):
                markdowns.append(
                    f"{header}:\n{str_enclose_start}{val}{str_enclose_end}"
                )
            else:
                markdowns.append(f"{header}: {val}")
    elif str_enclose_start is not None and isinstance(obj, str):
        markdowns.append(f"{key_prefix}:\n{str_enclose_start}{obj}{str_enclose_end}")
    else:
        markdowns.append(f"{key_prefix}: {obj}")

    return "\n".join(markdowns)
+
+
def is_collection(obj: Any) -> bool:
    """Return True for dicts, lists, and objects exposing ``__dict__``."""
    if isinstance(obj, (dict, list)):
        return True
    return hasattr(obj, "__dict__")
+
+
+# Copied from searchtweets-v2 and bit modified
# Copied from searchtweets-v2 and bit modified
def convert_utc_time(datetime_str: str) -> datetime:
    """
    Handles datetime argument conversion to the Labs API format, which is
    `YYYY-MM-DDTHH:mm:ssZ`.
    Flexible passing of date formats in the following types::

        - YYYYmmDDHHMM
        - YYYY-mm-DD
        - YYYY-mm-DD HH:MM
        - YYYY-mm-DDTHH:MM
        - 2m (set start_time to two months ago)
        - 3d (set start_time to three days ago)
        - 12h (set start_time to twelve hours ago)
        - 15m (set start_time to fifteen minutes ago)

    Args:
        datetime_str (str): valid formats are listed above.

    Returns:
        timezone-aware UTC datetime.
    """
    try:
        if len(datetime_str) <= 5:
            # Relative offsets like "3d"/"12h" are anchored at the current
            # UTC time. Fix: datetime.utcnow() is deprecated — derive the
            # same naive UTC value from an aware "now".
            _date = datetime.now(timezone.utc).replace(tzinfo=None)
            # parse out numeric character.
            num = int(datetime_str[:-1])
            if "d" in datetime_str:
                _date = _date + relativedelta(days=-num)
            elif "h" in datetime_str:
                _date = _date + relativedelta(hours=-num)
            elif "m" in datetime_str:
                _date = _date + relativedelta(minutes=-num)
            elif "M" in datetime_str:
                _date = _date + relativedelta(months=-num)
            elif "Y" in datetime_str:
                _date = _date + relativedelta(years=-num)
        elif not {"-", ":"} & set(datetime_str):
            _date = datetime.strptime(datetime_str, "%Y%m%d%H%M")
        elif "T" in datetime_str:
            _date = datetime.strptime(datetime_str, DATETIME_STRING_PATTERN)
        else:
            _date = datetime.strptime(datetime_str, "%Y-%m-%d %H:%M")

    except ValueError:
        # Date-only form, e.g. "2021-01-02".
        _date = datetime.strptime(datetime_str, "%Y-%m-%d")

    return _date.replace(tzinfo=timezone.utc)
+
+
def convert_datetime_str_to_epoch(datetime_str: str) -> Optional[int]:
    """Parse a datetime string and return its Unix epoch seconds, or None."""
    parsed = dateparser.parse(datetime_str)
    if parsed is None:
        return None
    # NOTE(review): time.mktime interprets the struct_time in the *local*
    # timezone — confirm that is intended for naive parse results.
    return math.trunc(time.mktime(parsed.timetuple()))
+
+
def tag_visible(element: Any) -> bool:
    """Return False for text nodes inside non-visible tags or HTML comments."""
    invisible_parents = (
        "style",
        "script",
        "head",
        "title",
        "meta",
        "[document]",
    )
    if element.parent.name in invisible_parents:
        return False
    return not isinstance(element, Comment)
+
+
def text_from_html(body: Union[str, bytes]) -> str:
    """Extract the visible text of an HTML document as a single string."""
    soup = BeautifulSoup(body, "html.parser")
    # Fix: find_all(string=True) is the non-deprecated spelling of
    # findAll(text=True) in Beautiful Soup 4.
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    return " ".join(t.strip() for t in visible_texts)
+
+
def dict_to_object(
    dictionary: Dict[str, Any],
    class_name_key: Optional[str] = "_target_",
    full_class_name: Optional[str] = None,
) -> Any:
    """Recursively convert a dict into an object.

    When ``class_name_key`` is present in the dict (or ``full_class_name`` is
    given), the referenced class is imported and instantiated with the
    remaining keys as kwargs; otherwise a plain dict is returned.
    """
    kwargs: Dict[str, Any] = {}
    for key, value in dictionary.items():
        if key == class_name_key:
            full_class_name = value
        elif isinstance(value, dict):
            kwargs[key] = dict_to_object(
                dictionary=value, class_name_key=class_name_key
            )
        else:
            kwargs[key] = value

    if full_class_name is None:
        return kwargs

    module_name, class_name = full_class_name.rsplit(".", 1)
    target_class = getattr(import_module(module_name), class_name)
    return target_class(**kwargs)
+
+
def datetime_handler(x: Any) -> Optional[Any]:
    """``json.dumps`` default hook: ISO-format datetimes, ``vars()`` for objects."""
    if x is None:
        return None
    if isinstance(x, datetime):
        return x.isoformat()
    return vars(x) if hasattr(x, "__dict__") else x
diff --git a/obsei_module/obsei-master/obsei/misc/web_search.py b/obsei_module/obsei-master/obsei/misc/web_search.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b8f92c8c76226b05102c14cc992d1b323e39eaa
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/misc/web_search.py
@@ -0,0 +1,35 @@
+from typing import Any, Dict, List, Optional
+
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3 import Retry
+
+GOOGLE_SEARCH_URL = "https://www.google.com/search"
+
+
+# Code is influenced from https://github.com/cowboy-bebug/app-store-scraper
def perform_search(
    request_url: str,
    query: str,
    search_url: str = GOOGLE_SEARCH_URL,
    search_country: Optional[str] = None,
    headers: Optional[Dict[str, Any]] = None,
    total: int = 3,
    backoff_factor: int = 3,
    status_force_list: Optional[List[int]] = None,
) -> requests.Response:
    """Issue a GET request to ``search_url`` for ``query`` with retry support.

    Args:
        request_url: URL prefix the retry adapter is mounted on.
        query: Search query, sent as the ``q`` parameter.
        search_url: Search endpoint (defaults to Google search).
        search_country: Optional country restriction (``cr`` parameter).
        headers: Optional HTTP headers.
        total: Maximum number of retries.
        backoff_factor: urllib3 exponential backoff factor.
        status_force_list: Status codes that trigger a retry; defaults to
            [404, 429].

    Returns:
        The raw ``requests.Response``.
    """

    params = {"q": query}
    if search_country:
        params["cr"] = search_country

    if not status_force_list:
        status_force_list = [404, 429]
    retries = Retry(
        total=total,
        backoff_factor=backoff_factor,
        status_forcelist=status_force_list,
    )
    with requests.Session() as s:
        # NOTE(review): the retry adapter is mounted for the `request_url`
        # prefix, but the GET goes to `search_url` — retries only apply when
        # search_url starts with request_url. Confirm callers pass a matching
        # prefix.
        s.mount(request_url, HTTPAdapter(max_retries=retries))
        return s.get(search_url, headers=headers, params=params)
diff --git a/obsei_module/obsei-master/obsei/misc/youtube_reviews_scrapper.py b/obsei_module/obsei-master/obsei/misc/youtube_reviews_scrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..090c2e3d529b858856155d508eee5bb9f2ac3158
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/misc/youtube_reviews_scrapper.py
@@ -0,0 +1,169 @@
+# Code in this file is copied from https://github.com/egbertbouman/youtube-comment-downloader/blob/master/youtube_comment_downloader/downloader.py
+# and modified to fit the needs of this project. When code from youtube-comment-downloader was copied it was MIT licensed.
+# Code Commit: https://github.com/egbertbouman/youtube-comment-downloader/commit/9a15b8e3fbaebad660875409fb1bbe74db17f304
+
+import json
+import logging
+import time
+import re
+from datetime import datetime, timezone
+
+import dateparser
+from typing import Optional, Any, List, Dict, Generator
+
+import requests
+from pydantic import BaseModel
+from requests import Session
+
+logger = logging.getLogger(__name__)
+
+
+class YouTubeCommentExtractor(BaseModel):
+ _YT_URL: str = 'https://www.youtube.com'
+ _YT_CFG_REGEX: str = r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;'
+ _YT_INITIAL_DATA_REGEX: str = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;\s*(?:var\s+meta| str:
+ match = re.search(pattern, text)
+ return match.group(group) if match else ''
+
    def _ajax_request(self, session: Session, endpoint: Dict[str, Any], ytcfg: Dict[str, Any]) -> Any:
        """POST a continuation request to YouTube's innertube API.

        Returns the decoded JSON on HTTP 200, an empty dict on 403/413
        (blocked or payload too large), and implicitly ``None`` when all
        ``request_retries`` attempts end with other status codes.
        """
        url = self._YT_URL + endpoint['commandMetadata']['webCommandMetadata']['apiUrl']

        data = {'context': ytcfg['INNERTUBE_CONTEXT'],
                'continuation': endpoint['continuationCommand']['token']}

        for _ in range(self.request_retries):
            response = session.post(url, params={'key': ytcfg['INNERTUBE_API_KEY']}, json=data)
            if response.status_code == 200:
                return response.json()
            if response.status_code in [403, 413]:
                return {}
            else:
                # Transient failure (e.g. 5xx): back off before retrying.
                time.sleep(self.sleep_time)
+
+ @staticmethod
+ def _search_dict(partial: Any, search_key: str) -> Generator[Any, Any, None]:
+ stack = [partial]
+ while stack:
+ current_item = stack.pop()
+ if isinstance(current_item, dict):
+ for key, value in current_item.items():
+ if key == search_key:
+ yield value
+ else:
+ stack.append(value)
+ elif isinstance(current_item, list):
+ for value in current_item:
+ stack.append(value)
+
    def _fetch_comments(self, until_datetime: Optional[datetime] = None) -> Generator[Any, Any, None]:
        """Stream comment dicts for ``self.video_url``.

        Scrapes the video page for innertube config/initial data, then walks
        continuation tokens until exhausted. Stops early when a comment older
        than ``until_datetime`` is reached, when YouTube returns an error, or
        when the configuration cannot be extracted.
        """
        session = requests.Session()
        session.headers['User-Agent'] = self.user_agent
        response = session.get(self.video_url)

        # Consent redirect (EU): set the consent cookie and retry once.
        if response.request and response.request.url and 'uxe=' in response.request.url:
            session.cookies.set('CONSENT', 'YES+cb', domain='.youtube.com')  # type: ignore[no-untyped-call]
            response = session.get(self.video_url)

        html = response.text
        ytcfg = json.loads(self._regex_search(html, self._YT_CFG_REGEX))
        if not ytcfg:
            return  # Unable to extract configuration
        if self.lang_code:
            ytcfg['INNERTUBE_CONTEXT']['client']['hl'] = self.lang_code

        data = json.loads(self._regex_search(html, self._YT_INITIAL_DATA_REGEX))

        section = next(self._search_dict(data, 'itemSectionRenderer'), None)
        renderer = next(self._search_dict(section, 'continuationItemRenderer'), None) if section else None
        if not renderer:
            # Comments disabled?
            return

        needs_sorting = self.sort_by != 0
        continuations = [renderer['continuationEndpoint']]
        while continuations:
            continuation = continuations.pop()
            response = self._ajax_request(session, continuation, ytcfg)

            if not response:
                break
            if list(self._search_dict(response, 'externalErrorMessage')):
                logger.warning('Error returned from server: %s', next(self._search_dict(response, 'externalErrorMessage')))
                return

            if needs_sorting:
                # Switch to the requested sort order via the sort sub-menu,
                # then restart from that continuation.
                sub_menu: Dict[str, Any] = next(self._search_dict(response, 'sortFilterSubMenuRenderer'), {})
                sort_menu = sub_menu.get('subMenuItems', [])
                if self.sort_by < len(sort_menu):
                    continuations = [sort_menu[self.sort_by]['serviceEndpoint']]
                    needs_sorting = False
                    continue
                # TODO: Fix it. Causing observer to fail silently
                logger.warning("Unable to set sorting")
                # raise RuntimeError('Failed to set sorting')

            actions = list(self._search_dict(response, 'reloadContinuationItemsCommand')) + \
                list(self._search_dict(response, 'appendContinuationItemsAction'))

            for action in actions:
                for item in action.get('continuationItems', []):
                    if action['targetId'] == 'comments-section':
                        # Process continuations for comments and replies.
                        continuations[:0] = [ep for ep in self._search_dict(item, 'continuationEndpoint')]
                    if self.fetch_replies:
                        # TODO: Fix it. This functionality is broken
                        if action['targetId'].startswith('comment-replies-item') and 'continuationItemRenderer' in item:
                            # Process the 'Show more replies' button
                            continuations.append(next(self._search_dict(item, 'buttonRenderer'))['command'])

            for comment in reversed(list(self._search_dict(response, 'commentRenderer'))):
                # Reply comment ids contain a "." separator; skip them when
                # replies are not requested.
                if not self.fetch_replies and "." in comment['commentId']:
                    continue

                comment_time_string = comment['publishedTimeText']['runs'][0]['text']
                comment_time_string = comment_time_string or ''
                comment_time = dateparser.parse(
                    comment_time_string.split('(edited)', 1)[0].strip(),
                )

                if comment_time:
                    comment_time = comment_time.replace(tzinfo=timezone.utc)
                    if until_datetime and until_datetime > comment_time:
                        return

                yield {'comment_id': comment['commentId'],
                       'text': ''.join([c['text'] for c in comment['contentText'].get('runs', [])]),
                       'time': comment_time,
                       'author': comment.get('authorText', {}).get('simpleText', ''),
                       'channel': comment['authorEndpoint']['browseEndpoint'].get('browseId', ''),
                       'votes': comment.get('voteCount', {}).get('simpleText', '0'),
                       'photo': comment['authorThumbnail']['thumbnails'][-1]['url'],
                       'heart': next(self._search_dict(comment, 'isHearted'), False)}

            time.sleep(self.sleep_time)
+
+ def fetch_comments(self, until_datetime: Optional[datetime] = None) -> List[Dict[str, Any]]:
+ comments: List[Dict[str, Any]] = []
+ for comment in self._fetch_comments(until_datetime=until_datetime):
+ comments.append(comment)
+ if self.max_comments and self.max_comments == len(comments):
+ break
+
+ return comments
diff --git a/obsei_module/obsei-master/obsei/payload.py b/obsei_module/obsei-master/obsei/payload.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9673382658962c4a0427f66480ff58674e1219c
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/payload.py
@@ -0,0 +1,33 @@
+from typing import Any, Dict, Optional
+
+from pydantic import BaseModel, Field
+
+
class BasePayload(BaseModel):
    """Common envelope for data flowing between sources, analyzers, and sinks."""

    # default_factory makes the per-instance fresh-dict intent explicit;
    # pydantic copies literal mutable defaults too, so behavior is unchanged.
    segmented_data: Dict[str, Any] = Field(default_factory=dict)
    meta: Dict[str, Any] = Field(default_factory=dict)
    source_name: Optional[str] = "Undefined"

    class Config:
        arbitrary_types_allowed = True
+
+
class TextPayload(BasePayload):
    """Payload carrying a piece of processed text through the pipeline."""

    processed_text: str

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view of this payload."""
        return {
            "processed_text": self.processed_text,
            "segmented_data": self.segmented_data,
            "meta": self.meta,
            "source_name": self.source_name,
        }

    def is_contains_classification_payload(self) -> bool:
        """True when a classifier has attached data to this payload."""
        return bool(self.segmented_data) and "classifier_data" in self.segmented_data

    class Config:
        arbitrary_types_allowed = True
diff --git a/obsei_module/obsei-master/obsei/postprocessor/__init__.py b/obsei_module/obsei-master/obsei/postprocessor/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/obsei_module/obsei-master/obsei/postprocessor/base_postprocessor.py b/obsei_module/obsei-master/obsei/postprocessor/base_postprocessor.py
new file mode 100644
index 0000000000000000000000000000000000000000..105b67f8fb3e816ba65b4434ed7f4e27211f8b56
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/postprocessor/base_postprocessor.py
@@ -0,0 +1,26 @@
+from typing import List, Any
+
+from pydantic_settings import BaseSettings
+
+from obsei.payload import TextPayload
+from abc import abstractmethod
+
+
class BasePostprocessorConfig(BaseSettings):
    """Base configuration shared by all postprocessors."""

    # Discriminator used when configs are built from serialized workflows.
    TYPE: str = "Base"

    class Config:
        multi_label = True
+
+
class BasePostprocessor(BaseSettings):
    """Abstract base for postprocessors that transform analyzer output payloads."""

    TYPE: str = "Base"

    # NOTE(review): BaseSettings does not use ABCMeta, so @abstractmethod is
    # not enforced at instantiation time — subclasses override by convention.
    @abstractmethod
    def postprocess_input(
        self, input_list: List[TextPayload], config: BasePostprocessorConfig, **kwargs: Any
    ) -> List[TextPayload]:
        """Transform ``input_list`` per ``config`` and return new payloads."""
        pass

    class Config:
        arbitrary_types_allowed = True
diff --git a/obsei_module/obsei-master/obsei/postprocessor/inference_aggregator.py b/obsei_module/obsei-master/obsei/postprocessor/inference_aggregator.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd1ee12a8b3c06bfff5e28088034fdc941ac7159
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/postprocessor/inference_aggregator.py
@@ -0,0 +1,55 @@
+from typing import List, Optional, Dict, Any
+
+from obsei.payload import TextPayload
+from obsei.postprocessor.base_postprocessor import (
+ BasePostprocessorConfig,
+ BasePostprocessor
+)
+from obsei.postprocessor.inference_aggregator_function import BaseInferenceAggregateFunction
+from obsei.preprocessor.text_splitter import TextSplitterPayload
+
+
class InferenceAggregatorConfig(BasePostprocessorConfig):
    """Config selecting the strategy used to merge chunk-level payloads."""

    # Strategy that merges a document's chunk payloads into one payload.
    aggregate_function: BaseInferenceAggregateFunction
+
+
class InferenceAggregator(BasePostprocessor):
    """Postprocessor that merges chunk-level analyzer output per document."""

    def postprocess_input(  # type: ignore[override]
        self, input_list: List[TextPayload], config: InferenceAggregatorConfig, **kwargs: Any
    ) -> List[TextPayload]:
        """Group payloads by document and aggregate each group."""
        grouped = self.segregate_payload(input_list)

        aggregated: List[TextPayload] = []
        for payloads in grouped.values():
            aggregated.extend(config.aggregate_function.execute(payloads))

        return aggregated

    @staticmethod
    def segregate_payload(
        input_list: List[TextPayload],
    ) -> Dict[str, List[TextPayload]]:
        """Bucket payloads by originating document, ordered by chunk id."""
        buckets: Dict[str, List[TextPayload]] = {}

        # Chunks produced by the splitter share a document_id; payloads
        # without splitter metadata fall back to their list index as key.
        for position, payload in enumerate(input_list):
            splitter_data: Optional[TextSplitterPayload] = (
                payload.meta.get("splitter", None) if payload.meta else None
            )
            key = splitter_data.document_id if splitter_data else str(position)
            buckets.setdefault(key, []).append(payload)

        # Restore original chunk order inside each document bucket.
        for chunk_list in buckets.values():
            if (
                chunk_list
                and chunk_list[0].meta
                and chunk_list[0].meta.get("splitter", None)
            ):
                chunk_list.sort(key=lambda p: p.meta["splitter"].chunk_id)  # type: ignore[no-any-return]

        return buckets
diff --git a/obsei_module/obsei-master/obsei/postprocessor/inference_aggregator_function.py b/obsei_module/obsei-master/obsei/postprocessor/inference_aggregator_function.py
new file mode 100644
index 0000000000000000000000000000000000000000..d09164b6161c667ff463ff8c1754ed49ef49604b
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/postprocessor/inference_aggregator_function.py
@@ -0,0 +1,127 @@
+import logging
+from abc import abstractmethod
+from typing import Any, Dict, List, Tuple
+
+from pydantic import BaseModel
+
+from obsei.payload import TextPayload
+
+logger = logging.getLogger(__name__)
+
+
class BaseInferenceAggregateFunction(BaseModel):
    """Base class for functions that merge chunk-level payloads into one."""

    @abstractmethod
    def execute(
        self, input_list: List[TextPayload], **kwargs: Any
    ) -> List[TextPayload]:
        """Aggregate a document's chunk payloads; subclasses must override."""
        pass

    @staticmethod
    def _extract_merged_parameters(
        input_list: List[TextPayload],
    ) -> Tuple[List[str], int, Dict[str, Any]]:
        """Merge chunk metadata and collect chunk texts.

        Returns:
            Tuple of (chunk texts, combined character length, merged meta).
        """
        document_length: int = 0
        meta: Dict[str, Any] = {}
        doc_text: List[str] = []
        # Merge meta across payload and collect score keys
        for payload in input_list:
            document_length += len(payload.processed_text)
            meta = {**meta, **payload.meta} if payload.meta else meta
            # Remove per-chunk splitter key from the merged meta. A default is
            # required: payloads that were never split (segregate_payload keys
            # them by list index) have no "splitter" key, and the bare
            # pop("splitter") raised KeyError for them.
            meta.pop("splitter", None)
            doc_text.append(payload.processed_text)
        return doc_text, document_length, meta
+
+
class ClassificationAverageScore(BaseInferenceAggregateFunction):
    """Aggregates classifier scores as a chunk-length-weighted average."""

    name: str = "ClassificationAverageScore"
    default_value: float = 0.0

    def execute(
        self, input_list: List[TextPayload], **kwargs: Any
    ) -> List[TextPayload]:
        """Collapse chunk payloads into one payload with averaged scores."""
        if not input_list:
            logger.warning("Can't aggregate empty list")
            return input_list

        if not input_list[0].is_contains_classification_payload():
            logger.warning(
                "ClassificationAverage supports Classification and Sentiment Analyzers only"
            )
            return input_list

        default_value = kwargs.get("default_value", self.default_value)
        source_name = input_list[0].source_name
        doc_text, document_length, meta = self._extract_merged_parameters(input_list)

        # Weighted average: each chunk contributes proportionally to its
        # share of the total document length.
        scores: Dict[str, float] = {}
        for payload in input_list:
            if not payload.segmented_data:
                continue
            weight = len(payload.processed_text) / document_length
            classifier_data = payload.segmented_data.get("classifier_data", {})
            for label, score in classifier_data.items():
                scores[label] = scores.get(label, default_value) + score * weight

        merged_payload = TextPayload(
            processed_text=" ".join(doc_text),
            meta=meta,
            segmented_data={
                "aggregator_data": {
                    "avg_score": scores,
                    "aggregator_name": self.name,
                }
            },
            source_name=source_name,
        )
        return [merged_payload]
+
+
class ClassificationMaxCategories(BaseInferenceAggregateFunction):
    """Counts categories crossing a score threshold and tracks max scores."""

    name: str = "ClassificationMaxCategories"
    # Minimum classifier score for a category to be counted.
    score_threshold: float = 0.5

    def execute(
        self, input_list: List[TextPayload], **kwargs: Any
    ) -> List[TextPayload]:
        """Collapse chunk payloads into one payload with per-category stats.

        For every category whose chunk score exceeds the threshold, records
        how many chunks crossed it and the maximum score observed.
        """
        if len(input_list) == 0:
            logger.warning("Can't aggregate empty list")
            return input_list

        if not input_list[0].is_contains_classification_payload():
            # Fixed copy-pasted message that previously named ClassificationAverage.
            logger.warning(
                "ClassificationMaxCategories supports Classification and Sentiment Analyzers only"
            )
            return input_list

        score_threshold = kwargs.get("score_threshold", self.score_threshold)

        source_name = input_list[0].source_name

        doc_text, _, meta = self._extract_merged_parameters(input_list)

        max_scores: Dict[str, float] = {}
        category_count: Dict[str, int] = {}
        for payload in input_list:
            if payload.segmented_data:
                for key, value in payload.segmented_data.get("classifier_data", {}).items():
                    if value > score_threshold:
                        category_count[key] = category_count.get(key, 0) + 1
                        max_scores[key] = max(max_scores.get(key, 0.0), value)

        return [
            TextPayload(
                processed_text=" ".join(doc_text),
                meta=meta,
                segmented_data={
                    "aggregator_data": {
                        "category_count": category_count,
                        "max_scores": max_scores,
                        "aggregator_name": self.name,
                    }
                },
                source_name=source_name,
            )
        ]
diff --git a/obsei_module/obsei-master/obsei/preprocessor/__init__.py b/obsei_module/obsei-master/obsei/preprocessor/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/obsei_module/obsei-master/obsei/preprocessor/base_preprocessor.py b/obsei_module/obsei-master/obsei/preprocessor/base_preprocessor.py
new file mode 100644
index 0000000000000000000000000000000000000000..5af9321e5c8466f5f32a14a84b9114083796919e
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/preprocessor/base_preprocessor.py
@@ -0,0 +1,26 @@
+from abc import abstractmethod
+from typing import List, Any
+
+from pydantic_settings import BaseSettings
+
+from obsei.payload import TextPayload
+
+
class BaseTextProcessorConfig(BaseSettings):
    """Base configuration for text preprocessors."""

    # Discriminator used when (de)serializing configs.
    TYPE: str = "Base"

    class Config:
        arbitrary_types_allowed = True
+
+
class BaseTextPreprocessor(BaseSettings):
    """Base class for preprocessors that transform text payloads."""

    # Discriminator used when (de)serializing.
    TYPE: str = "Base"

    @abstractmethod
    def preprocess_input(
        self, input_list: List[TextPayload], config: BaseTextProcessorConfig, **kwargs: Any
    ) -> List[TextPayload]:
        """Transform the payload list; subclasses must override."""
        pass

    class Config:
        arbitrary_types_allowed = True
diff --git a/obsei_module/obsei-master/obsei/preprocessor/text_cleaner.py b/obsei_module/obsei-master/obsei/preprocessor/text_cleaner.py
new file mode 100644
index 0000000000000000000000000000000000000000..43a95662af3f833e0ec9b82254a095ef36ecf7e6
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/preprocessor/text_cleaner.py
@@ -0,0 +1,76 @@
+import traceback
+import logging
+from typing import List, Any, Optional, Tuple
+
+from obsei.payload import TextPayload
+from obsei.preprocessor.base_preprocessor import (
+ BaseTextPreprocessor,
+ BaseTextProcessorConfig,
+)
+from obsei.preprocessor.text_cleaning_function import TextCleaningFunction, ToLowerCase, RemoveWhiteSpaceAndEmptyToken, \
+ RemovePunctuation, RemoveSpecialChars, DecodeUnicode, RemoveDateTime, ReplaceDomainKeywords, TokenStemming, \
+ RemoveStopWords
+from obsei.preprocessor.text_tokenizer import BaseTextTokenizer, NLTKTextTokenizer
+
+cleaner_logger: logging.Logger = logging.getLogger(__name__)
+
+
class TextCleanerConfig(BaseTextProcessorConfig):
    """Configuration for TextCleaner; installs a default cleaning chain."""

    cleaning_functions: Optional[List[TextCleaningFunction]] = None
    stop_words_language: Optional[str] = "english"
    stop_words: Optional[List[str]] = None
    domain_keywords: Optional[Tuple[str, str]] = None
    disable_tokenization: bool = False

    def __init__(self, **data: Any):
        super().__init__(**data)

        # Leave caller-supplied functions untouched; otherwise fall back
        # to the standard cleaning pipeline, in order.
        if self.cleaning_functions:
            return

        default_chain: List[TextCleaningFunction] = [
            ToLowerCase(),
            RemoveWhiteSpaceAndEmptyToken(),
            RemovePunctuation(),
            RemoveSpecialChars(),
            DecodeUnicode(),
            RemoveDateTime(),
            ReplaceDomainKeywords(domain_keywords=self.domain_keywords),
            TokenStemming(),
            RemoveStopWords(
                language=self.stop_words_language, stop_words=self.stop_words
            ),
            RemoveWhiteSpaceAndEmptyToken(),
        ]
        self.cleaning_functions = default_chain
+
+
class TextCleaner(BaseTextPreprocessor):
    """Preprocessor that runs configured cleaning functions on payload text."""

    text_tokenizer: Optional[BaseTextTokenizer] = None

    def __init__(self, **data: Any):
        super().__init__(**data)
        # Fall back to the NLTK tokenizer when none was injected.
        self.text_tokenizer = self.text_tokenizer or NLTKTextTokenizer()

    def preprocess_input(  # type: ignore[override]
        self,
        input_list: List[TextPayload],
        config: TextCleanerConfig,
        **kwargs: Any,
    ) -> List[TextPayload]:
        """Clean every payload's text in place and return the same list."""
        if config.cleaning_functions is None:
            return input_list

        for payload in input_list:
            # Either treat the whole text as a single token, or tokenize.
            if config.disable_tokenization or self.text_tokenizer is None:
                tokens = [payload.processed_text]
            else:
                tokens = self.text_tokenizer.tokenize_text(payload.processed_text)

            # Run the cleaning chain; a failing function is logged and skipped.
            for func in config.cleaning_functions:
                try:
                    tokens = func.execute(tokens)
                except Exception as ex:
                    cleaner_logger.warning(f"Received exception: {ex}")
                    traceback.print_exc()

            payload.processed_text = " ".join(tokens)

        return input_list
diff --git a/obsei_module/obsei-master/obsei/preprocessor/text_cleaning_function.py b/obsei_module/obsei-master/obsei/preprocessor/text_cleaning_function.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ec4b98986fbfd3839d86ff486e5928705ba92ce
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/preprocessor/text_cleaning_function.py
@@ -0,0 +1,176 @@
+import logging
+import re
+import string
+from abc import abstractmethod
+from typing import Any, List, Optional, Tuple
+from unicodedata import normalize
+
+import nltk
+import spacy
+from dateutil.parser import parse
+from nltk.corpus import stopwords
+from pydantic import BaseModel, PrivateAttr, Field
+from spacy import Language # type: ignore
+from spacy.cli import download # type: ignore
+
+cleaner_func_logger: logging.Logger = logging.getLogger(__name__)
+
+
class TextCleaningFunction(BaseModel):
    """Base class for a single token-level text cleaning step."""

    @abstractmethod
    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        """Transform the token list; subclasses must override."""
        pass
+
+
class ToLowerCase(TextCleaningFunction):
    """Lower-cases every token."""

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        return list(map(str.lower, tokens))
+
+
class RemoveWhiteSpaceAndEmptyToken(TextCleaningFunction):
    """Strips surrounding whitespace and drops tokens that become empty."""

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        stripped = (token.strip() for token in tokens)
        return [token for token in stripped if token != ""]
+
+
# Removes words that don't add any meaning to the sequence
class RemoveStopWords(TextCleaningFunction):
    """Filters out stop words from the token list."""

    # Explicit stop-word list; when absent, NLTK's list for `language` is used.
    stop_words: Optional[List[str]] = None
    language: Optional[str] = "english"

    def __init__(self, **data: Any):
        super().__init__(**data)
        if not self.stop_words:
            try:
                # NLTK resources are looked up by "<category>/<name>"; the
                # previous bare "stopwords" lookup never matched, so a
                # download was triggered on every instantiation.
                nltk.data.find("corpora/stopwords")
            except LookupError:
                nltk.download("stopwords")
            self.stop_words = stopwords.words(self.language)

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        """Return tokens with configured stop words removed."""
        if not self.stop_words:
            return tokens
        return [token for token in tokens if token not in self.stop_words]
+
+
class RemovePunctuation(TextCleaningFunction):
    """Strips punctuation characters and drops tokens left empty."""

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        # Build the translation table once; the original rebuilt it and
        # translated every token twice (once for the filter, once for the
        # result).
        table = str.maketrans("", "", string.punctuation)
        cleaned = (token.translate(table) for token in tokens)
        return [token for token in cleaned if token]
+
+
# Transforms tokens to standardized form
class TokenStemming(TextCleaningFunction):
    """Stems each token via NLTK's PorterStemmer when available."""

    stemmer: Optional[Any] = None

    def __init__(self, **data: Any):
        super().__init__(**data)
        if self.stemmer:
            return
        try:
            from nltk.stem import PorterStemmer

            self.stemmer = PorterStemmer()
        except ImportError:
            cleaner_func_logger.warning(
                "NLTK module is not installed hence token stemming will not work"
            )

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        """Stem every token; no-op when no stemmer could be created."""
        stemmer = self.stemmer
        if not stemmer:
            return tokens
        return [stemmer.stem(token) for token in tokens]
+
+
class RemoveSpecialChars(TextCleaningFunction):
    """
    Removes special characters by eliminating all characters from each token
    and only retains alphabetic, numeric or alphanumeric tokens by stripping
    special characters from them
    """

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        pattern = re.compile("[^A-Za-z0-9]+")
        stripped = (pattern.sub("", token) for token in tokens)
        return [token for token in stripped if token != ""]
+
+
# Converts unicodes to ASCII characters
class DecodeUnicode(TextCleaningFunction):
    """Replaces each token with its closest ASCII representation."""

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        decoded: List[str] = []
        for token in tokens:
            # NFKD-decompose, then drop anything that can't encode as ASCII.
            ascii_bytes = normalize("NFKD", token).encode("ascii", "ignore")
            decoded.append(ascii_bytes.decode("utf-8"))
        return decoded
+
+
class RemoveDateTime(TextCleaningFunction):
    """Removes date/time expressions detected by dateutil's fuzzy parser."""

    # Reused to drop whitespace-only tokens left behind after removal.
    _white_space_cleaner = RemoveWhiteSpaceAndEmptyToken()

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        """Strip date/time fragments from the joined token text.

        When no parseable date/time is found, dateutil raises ValueError and
        the original tokens are returned (whitespace-cleaned).
        """
        text: str = " ".join(tokens)
        try:
            fuzzy_tokens: Tuple[str]
            # fuzzy_with_tokens returns the non-date remainder fragments.
            _, fuzzy_tokens = parse(text, fuzzy_with_tokens=True)  # type: ignore
            tokens = " ".join(fuzzy_tokens).split()
        except ValueError:
            cleaner_func_logger.warning("Token contain invalid date time format")
        return self._white_space_cleaner.execute(tokens)
+
+
# Replaces domain specific keywords
class ReplaceDomainKeywords(TextCleaningFunction):
    """Substitutes configured (source, target) keyword pairs in the text."""

    domain_keywords: Optional[List[Tuple[str, str]]] = None

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        # Nothing to do when no keyword mapping was configured.
        if not self.domain_keywords or len(self.domain_keywords) == 0:
            return tokens

        text = " ".join(tokens)
        for source_keyword, target_keyword in self.domain_keywords:
            # NOTE(review): the lower-cased variant is only *checked*;
            # replacement targets the original casing (behavior kept as-is).
            if source_keyword in text or source_keyword.lower() in text:
                text = text.replace(source_keyword, target_keyword)
        return text.split()
+
+
class RegExSubstitute(TextCleaningFunction):
    """Applies a regular-expression substitution to every token."""

    pattern: Optional[str] = None
    substitute: Optional[str] = None

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        # Both a pattern and a replacement are required to do anything.
        if not self.pattern or not self.substitute:
            return tokens

        regex = re.compile(self.pattern)
        return [regex.sub(self.substitute, token) for token in tokens]
+
+
class SpacyLemmatization(TextCleaningFunction):
    """Lemmatizes tokens with a spaCy pipeline (parser/NER disabled)."""

    _nlp: Language = PrivateAttr()
    model_name_or_path: str = Field("en_core_web_sm")
    batch_size: int = 4
    n_process: int = 1

    def __init__(self, **data: Any):
        super().__init__(**data)
        try:
            self._nlp = spacy.load(
                self.model_name_or_path,
                disable=["parser", "ner"],
            )
        except OSError:
            # spacy.load raises OSError when the model isn't installed; the
            # previous bare `except:` also swallowed KeyboardInterrupt and
            # SystemExit.
            download(self.model_name_or_path)
            self._nlp = spacy.load(
                self.model_name_or_path,
                disable=["parser", "ner"],
            )

    def execute(self, tokens: List[str], **kwargs: Any) -> List[str]:
        """Return each token replaced by the lemmas of its words."""
        processed_tokens: List[str] = []
        for doc in self._nlp.pipe(
            texts=tokens, batch_size=self.batch_size, n_process=self.n_process
        ):
            processed_tokens.append(" ".join(token.lemma_ for token in doc))
        return processed_tokens
diff --git a/obsei_module/obsei-master/obsei/preprocessor/text_splitter.py b/obsei_module/obsei-master/obsei/preprocessor/text_splitter.py
new file mode 100644
index 0000000000000000000000000000000000000000..9124d367567bcbbef8149965afc2a750c16664f2
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/preprocessor/text_splitter.py
@@ -0,0 +1,129 @@
+import logging
+from typing import List, Optional, Any
+import uuid
+
+import nltk
+from nltk import sent_tokenize
+from pydantic import BaseModel
+
+from obsei.payload import TextPayload
+from obsei.preprocessor.base_preprocessor import (
+ BaseTextPreprocessor,
+ BaseTextProcessorConfig,
+)
+
+logger = logging.getLogger(__name__)
+
+
class TextSplitterPayload(BaseModel):
    """Metadata describing one chunk produced by TextSplitter."""

    # The chunk text itself.
    phrase: str
    # 0-based position of the chunk within its document.
    chunk_id: int
    # Character length of `phrase`.
    chunk_length: int
    # Identifier shared by all chunks of the same source document.
    document_id: str
    # Filled in once all chunks of the document are known.
    total_chunks: Optional[int] = None
+
+
class TextSplitterConfig(BaseTextProcessorConfig):
    """Configuration for TextSplitter chunking behavior."""

    # Maximum characters per chunk (split point snaps back to whitespace).
    max_split_length: int = 512
    split_stride: int = 0  # overlap length
    document_id_key: Optional[str] = None  # document_id in meta
    enable_sentence_split: bool = False
    honor_paragraph_boundary: bool = False
    paragraph_marker: str = '\n\n'
    sentence_tokenizer: str = 'tokenizers/punkt/PY3/english.pickle'

    def __init__(self, **data: Any):
        super().__init__(**data)

        # Sentence splitting needs the NLTK punkt models available.
        if self.enable_sentence_split:
            nltk.download('punkt')
+
+
class TextSplitter(BaseTextPreprocessor):
    """Splits payload text into length-bounded, optionally overlapping chunks."""

    def preprocess_input(  # type: ignore[override]
        self, input_list: List[TextPayload], config: TextSplitterConfig, **kwargs: Any
    ) -> List[TextPayload]:
        """Split each payload into chunk payloads carrying splitter metadata.

        Returns one TextPayload per chunk; each chunk's TextSplitterPayload
        is stored under the "splitter" key of the payload meta.
        """
        text_splits: List[TextPayload] = []

        for idx, input_data in enumerate(input_list):
            # Prefer a caller-provided document id from meta, else generate one.
            if (
                config.document_id_key
                and input_data.meta
                and config.document_id_key in input_data.meta
            ):
                document_id = str(input_data.meta.get(config.document_id_key))
            else:
                document_id = uuid.uuid4().hex

            if config.honor_paragraph_boundary:
                paragraphs = input_data.processed_text.split(config.paragraph_marker)
            else:
                paragraphs = [input_data.processed_text]

            # Units whose boundaries are respected while chunking.
            atomic_texts: List[str] = []
            for paragraph in paragraphs:
                if config.enable_sentence_split:
                    atomic_texts.extend(sent_tokenize(paragraph))
                else:
                    atomic_texts.append(paragraph)

            split_id = 0
            document_splits: List[TextSplitterPayload] = []
            for text in atomic_texts:
                text_length = len(text)
                if text_length == 0:
                    continue

                start_idx = 0
                while start_idx < text_length:
                    # Back up by the stride so consecutive chunks overlap,
                    # snapping to a whitespace boundary.
                    if config.split_stride > 0 and start_idx > 0:
                        start_idx = (
                            self._valid_index(
                                text, start_idx - config.split_stride
                            )
                            + 1
                        )
                    # End at most max_split_length later, snapped back to
                    # whitespace so words are not cut in half.
                    end_idx = self._valid_index(
                        text,
                        min(start_idx + config.max_split_length, text_length),
                    )

                    phrase = text[start_idx:end_idx]
                    document_splits.append(
                        TextSplitterPayload(
                            phrase=phrase,
                            chunk_id=split_id,
                            chunk_length=len(phrase),
                            document_id=document_id,
                        )
                    )
                    start_idx = end_idx + 1
                    split_id += 1

            # Stamp the final chunk count on every chunk, then emit one
            # payload per chunk with splitter info attached to the meta.
            total_splits = len(document_splits)
            for split in document_splits:
                split.total_chunks = total_splits
                payload = TextPayload(
                    processed_text=split.phrase,
                    source_name=input_data.source_name,
                    segmented_data=input_data.segmented_data,
                    meta={**input_data.meta, **{"splitter": split}}
                    if input_data.meta
                    else {"splitter": split},
                )
                text_splits.append(payload)

        return text_splits

    @staticmethod
    def _valid_index(document: str, idx: int) -> int:
        """Clamp idx into [0, len] and move it back to a whitespace boundary."""
        if idx <= 0:
            return 0
        if idx >= len(document):
            return len(document)
        new_idx = idx
        while new_idx > 0:
            if document[new_idx] in [" ", "\n", "\t"]:
                break
            new_idx -= 1
        return new_idx
diff --git a/obsei_module/obsei-master/obsei/preprocessor/text_tokenizer.py b/obsei_module/obsei-master/obsei/preprocessor/text_tokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..5632c43d2ba0f9f01a69de333ea764b6653b11d6
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/preprocessor/text_tokenizer.py
@@ -0,0 +1,29 @@
+import logging
+from abc import abstractmethod
+from typing import Any, List, Optional
+
+import nltk
+from nltk import word_tokenize
+from pydantic import BaseModel
+
+logger = logging.getLogger(__name__)
+
+
class BaseTextTokenizer(BaseModel):
    """Base class for tokenizers that split text into word tokens."""

    @abstractmethod
    def tokenize_text(self, text: str) -> List[str]:
        """Split text into tokens; subclasses must override."""
        pass
+
+
class NLTKTextTokenizer(BaseTextTokenizer):
    """Word tokenizer backed by NLTK."""

    # Name of the NLTK tokenizer resource to ensure is downloaded.
    tokenizer_name: Optional[str] = "punkt"

    def __init__(self, **data: Any):
        super().__init__(**data)
        try:
            # Only download the tokenizer data when not already present.
            nltk.data.find(f"tokenizers/{self.tokenizer_name}")
        except LookupError:
            nltk.download(f"{self.tokenizer_name}")

    def tokenize_text(self, text: str) -> Any:
        """Split text into word tokens using nltk.word_tokenize."""
        return word_tokenize(text)
diff --git a/obsei_module/obsei-master/obsei/process_workflow.py b/obsei_module/obsei-master/obsei/process_workflow.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbd74b11ab26d10f1aeb344e0b7f30bfa1b687b1
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/process_workflow.py
@@ -0,0 +1,38 @@
+import logging
+
+from obsei.analyzer.base_analyzer import BaseAnalyzer, BaseAnalyzerConfig
+from obsei.configuration import ObseiConfiguration
+from obsei.sink.base_sink import BaseSink, BaseSinkConfig
+from obsei.source.base_source import BaseSourceConfig, BaseSource
+
+logger = logging.getLogger(__name__)
+
# Extract config via yaml file using `config_path` and `config_filename`
obsei_configuration = ObseiConfiguration()

# Initialize objects using configuration
source_config: BaseSourceConfig = obsei_configuration.initialize_instance("source_config")
source: BaseSource = obsei_configuration.initialize_instance("source")
analyzer: BaseAnalyzer = obsei_configuration.initialize_instance("analyzer")
analyzer_config: BaseAnalyzerConfig = obsei_configuration.initialize_instance("analyzer_config")
sink_config: BaseSinkConfig = obsei_configuration.initialize_instance("sink_config")
sink: BaseSink = obsei_configuration.initialize_instance("sink")

# This will fetch information from configured source ie twitter, app store etc
source_response_list = source.lookup(source_config)
for idx, source_response in enumerate(source_response_list):
    logger.info(f"source_response#'{idx}'='{vars(source_response)}'")

# This will execute analyzer (Sentiment, classification etc) on source data with provided analyzer_config
# Analyzer will write its output to `segmented_data` inside `analyzer_response`
analyzer_response_list = analyzer.analyze_input(
    source_response_list=source_response_list,
    analyzer_config=analyzer_config
)
for idx, analyzer_response in enumerate(analyzer_response_list):
    # Fixed copy-pasted label: these are analyzer responses, not source responses.
    logger.info(f"analyzer_response#'{idx}'='{vars(analyzer_response)}'")

# This will send analyzed output to configured sink ie Slack, Zendesk etc
sink_response_list = sink.send_data(analyzer_response_list, sink_config)
for idx, sink_response in enumerate(sink_response_list):
    # Fixed copy-pasted label: these are sink responses.
    logger.info(f"sink_response#'{idx}'='{vars(sink_response)}'")
diff --git a/obsei_module/obsei-master/obsei/processor.py b/obsei_module/obsei-master/obsei/processor.py
new file mode 100644
index 0000000000000000000000000000000000000000..20ccec4b3d85dee3949e83a37c24fe3ed8c7e49e
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/processor.py
@@ -0,0 +1,68 @@
+import logging
+from typing import Optional
+
+from pydantic import BaseModel
+
+from obsei.analyzer.base_analyzer import BaseAnalyzer, BaseAnalyzerConfig
+from obsei.sink.base_sink import BaseSink, BaseSinkConfig
+from obsei.source.base_source import BaseSource, BaseSourceConfig
+from obsei.workflow.workflow import Workflow
+
+logger = logging.getLogger(__name__)
+
+
class Processor(BaseModel):
    """Ties a source, analyzer and sink together into one pipeline run."""

    analyzer: BaseAnalyzer
    analyzer_config: Optional[BaseAnalyzerConfig] = None
    source: Optional[BaseSource] = None
    source_config: Optional[BaseSourceConfig] = None
    sink: Optional[BaseSink] = None
    sink_config: Optional[BaseSinkConfig] = None

    def process(
        self,
        workflow: Optional[Workflow] = None,
        source: Optional[BaseSource] = None,
        source_config: Optional[BaseSourceConfig] = None,
        sink: Optional[BaseSink] = None,
        sink_config: Optional[BaseSinkConfig] = None,
        analyzer: Optional[BaseAnalyzer] = None,
        analyzer_config: Optional[BaseAnalyzerConfig] = None,
    ) -> None:
        """Fetch from source, analyze, and forward results to the sink.

        Explicit arguments take precedence over instance attributes; when a
        workflow is given, its configs override any passed configs. Returns
        silently when either the source or the sink side is incomplete.
        """
        source = source or self.source
        sink = sink or self.sink
        analyzer = analyzer or self.analyzer

        # Renamed from `id` to avoid shadowing the builtin; the keyword
        # passed to the components is still `id`.
        workflow_id: Optional[str] = None
        if workflow:
            # Workflow configs take priority over explicitly passed configs.
            sink_config = workflow.config.sink_config
            source_config = workflow.config.source_config
            analyzer_config = workflow.config.analyzer_config
            workflow_id = workflow.id
        else:
            sink_config = sink_config or self.sink_config
            source_config = source_config or self.source_config
            analyzer_config = analyzer_config or self.analyzer_config

        # Nothing to do without a complete source and sink side.
        if source is None or source_config is None:
            return
        if sink is None or sink_config is None:
            return

        source_response_list = source.lookup(config=source_config, id=workflow_id)
        for idx, source_response in enumerate(source_response_list):
            logger.info(f"source_response#'{idx}'='{source_response}'")

        analyzer_response_list = analyzer.analyze_input(
            source_response_list=source_response_list,
            analyzer_config=analyzer_config,
            id=workflow_id,
        )
        for idx, analyzer_response in enumerate(analyzer_response_list):
            # Fixed copy-pasted log label: these are analyzer responses.
            logger.info(f"analyzer_response#'{idx}'='{analyzer_response}'")

        sink_response_list = sink.send_data(
            analyzer_responses=analyzer_response_list, config=sink_config, id=workflow_id
        )
        for idx, sink_response in enumerate(sink_response_list):
            # Fixed copy-pasted log label: these are sink responses.
            logger.info(f"sink_response#'{idx}'='{sink_response}'")
diff --git a/obsei_module/obsei-master/obsei/sink/__init__.py b/obsei_module/obsei-master/obsei/sink/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/obsei_module/obsei-master/obsei/sink/base_sink.py b/obsei_module/obsei-master/obsei/sink/base_sink.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0676fb7db4132085ff54bc9c8d71af8c9b58ada
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/sink/base_sink.py
@@ -0,0 +1,54 @@
+from abc import abstractmethod
+from typing import Any, Dict, List, Optional, Type, TypeVar
+
+from pydantic import Field
+from pydantic_settings import BaseSettings
+
+from obsei.payload import TextPayload
+from obsei.workflow.base_store import BaseStore
+
+
class Convertor(BaseSettings):
    """Converts a TextPayload into a plain dict for a sink to send."""

    def convert(
        self,
        analyzer_response: TextPayload,
        base_payload: Optional[Dict[str, Any]] = None,
        **kwargs: Any
    ) -> Dict[str, Any]:
        """Merge base_payload (if any) with the serialized payload.

        Payload fields override duplicate keys from base_payload. The
        original re-checked `base_payload is not None` after defaulting it
        to {}, making the else-branch unreachable; the dead branch is gone.
        """
        base_payload = base_payload or dict()
        return {**base_payload, **analyzer_response.to_dict()}

    class Config:
        arbitrary_types_allowed = True
+
+
# Bound TypeVar so `from_dict` can be typed as returning the subclass type.
T = TypeVar('T', bound='BaseSinkConfig')


class BaseSinkConfig(BaseSettings):
    """Base configuration for sinks."""

    # Discriminator used when (de)serializing sink configs.
    TYPE: str = "Base"

    @classmethod
    def from_dict(cls: Type[T], config: Dict[str, Any]) -> T:  # type: ignore[empty-body]
        """Build a config from a dict; subclasses are expected to override."""
        pass

    class Config:
        arbitrary_types_allowed = True
+
+
class BaseSink(BaseSettings):
    """Base class for sinks that deliver analyzer output somewhere."""

    # Converts payloads to dicts before sending; overridable per sink.
    convertor: Convertor = Field(Convertor())
    # Optional workflow state store.
    store: Optional[BaseStore] = None

    @abstractmethod
    def send_data(
        self, analyzer_responses: List[TextPayload], config: BaseSinkConfig, **kwargs: Any
    ) -> Any:
        """Deliver analyzer responses to the sink; subclasses must override."""
        pass

    class Config:
        arbitrary_types_allowed = True
diff --git a/obsei_module/obsei-master/obsei/sink/dailyget_sink.py b/obsei_module/obsei-master/obsei/sink/dailyget_sink.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c501a7a7940b15dcbeeee4d23172f94269ed37d
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/sink/dailyget_sink.py
@@ -0,0 +1,166 @@
+import json
+import logging
+from copy import deepcopy
+from datetime import timezone
+from typing import Any, Dict, List, Optional
+
+import pytz
+import requests
+from dateutil import parser
+
+from obsei.sink.base_sink import Convertor
+from obsei.sink.http_sink import HttpSink, HttpSinkConfig
+from obsei.payload import TextPayload
+from obsei.misc.utils import flatten_dict
+
+logger = logging.getLogger(__name__)
+
+
+TWITTER_URL_PREFIX = "https://twitter.com/"
+IST_TZ = pytz.timezone("Asia/Kolkata")
+
+
class PayloadConvertor(Convertor):
    """Converts Twitter payloads into DailyGet enquiry/message requests."""

    def convert(
        self,
        analyzer_response: TextPayload,
        base_payload: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> Dict[str, Any]:
        """Build the DailyGet request dict for one analyzer response.

        Non-Twitter payloads fall back to the plain merged-dict conversion.
        Expects `source_information` and `partner_id` in kwargs;
        `use_enquiry_api` switches between the enquiry and message formats.
        """
        request_payload = base_payload or {}
        use_enquiry_api = kwargs.get("use_enquiry_api", False)

        if analyzer_response.source_name != "Twitter":
            return {**request_payload, **analyzer_response.to_dict()}

        source_information = kwargs["source_information"]
        partner_id = kwargs["partner_id"]

        user_url = ""
        positive = 0.0
        negative = 0.0
        text = ""
        tweet_id = None
        created_at_str = None
        classification_list: List[str] = []

        # Scrape relevant fields out of the flattened payload by key substring.
        # NOTE(review): substring matching is heuristic — e.g. any key
        # containing "text" wins the text slot; order of dict items matters.
        flat_dict = flatten_dict(analyzer_response.to_dict())
        for k, v in flat_dict.items():
            if "username" in k:
                user_url = TWITTER_URL_PREFIX + v
            elif "text" in k:
                text = str(v).replace("\n", " ")
            elif "positive" in k:
                positive = float(v)
            elif "negative" in k:
                negative = float(v)
            elif "meta_id" in k:
                tweet_id = v
            elif "created_at" in k:
                created_at_str = v
            elif "segmented_data" in k and len(classification_list) < 2:
                # Keep at most two classification labels (last key segment).
                classification_list.append(k.rsplit("_", 1)[1])

        # Convert the UTC ISO timestamp to IST for display.
        created_at_str_parsed: Optional[str] = None
        if created_at_str:
            created_at = parser.isoparse(created_at_str)
            created_at_str_parsed = (
                created_at.replace(tzinfo=timezone.utc)
                .astimezone(tz=IST_TZ)
                .strftime("%Y-%m-%d %H:%M:%S")
            )

        tweet_url = f"{user_url}/status/{tweet_id}"
        # Sentiment rules
        if negative > 8.0:
            sentiment = "Strong Negative"
        elif 0.3 < negative <= 8.0:
            sentiment = "Negative"
        elif positive >= 0.8:
            sentiment = "Strong Positive"
        elif 0.4 < positive < 0.8:
            sentiment = "Positive"
        else:
            sentiment = "Neutral"

        if use_enquiry_api:
            # Enquiry API: everything goes into one key/value text blob.
            enquiry = {
                "Source": source_information,
                "FeedbackBy": user_url,
                "Sentiment": sentiment,
                "TweetUrl": tweet_url,
                "FormattedText": text,
                "PredictedCategories": ",".join(classification_list),
            }

            if created_at_str_parsed is not None:
                enquiry["ReportedAt"] = created_at_str_parsed

            kv_str_list = [k + ": " + str(v) for k, v in enquiry.items()]
            request_payload["enquiryMessage"] = "\n".join(kv_str_list)
        else:
            # Message API: structured JSON string under "messageDetail".
            message = {
                "message": text,
                "partnerId": partner_id,
                "query": source_information,
                "source": analyzer_response.source_name,
                "url": tweet_url,
                "userProfile": user_url,
                "sentiment": sentiment,
                "predictedCategories": ",".join(classification_list),
                "metadata": str(json.dumps(analyzer_response.segmented_data, ensure_ascii=False)),
                "originatedAt": created_at_str,
            }
            request_payload["messageDetail"] = str(json.dumps(message, ensure_ascii=False))

        return request_payload
+
+
class DailyGetSinkConfig(HttpSinkConfig):
    """Configuration for the DailyGet HTTP sink."""

    TYPE: str = "DailyGet"
    partner_id: str
    consumer_phone_number: str
    source_information: str
    # When True, payloads are formatted for the enquiry API instead of
    # the message API (see PayloadConvertor).
    use_enquiry_api: bool = False
    headers: Dict[str, Any] = {"Content-type": "application/json"}
+
+
class DailyGetSink(HttpSink):
    """HTTP sink that posts converted payloads to the DailyGet service."""

    def __init__(self, convertor: Convertor = PayloadConvertor(), **data: Any):
        super().__init__(convertor=convertor, **data)

    def send_data(  # type: ignore[override]
        self,
        analyzer_responses: List[TextPayload],
        config: DailyGetSinkConfig,
        **kwargs: Any,
    ) -> Any:
        """Convert each response, POST it, and return the HTTP responses."""
        headers = config.headers

        # First pass: convert every analyzer response to a request payload.
        request_payloads = []
        for analyzer_response in analyzer_responses:
            base = (
                dict()
                if config.base_payload is None
                else deepcopy(config.base_payload)
            )
            request_payloads.append(
                self.convertor.convert(
                    analyzer_response=analyzer_response,
                    base_payload=base,
                    source_information=config.source_information,
                    use_enquiry_api=config.use_enquiry_api,
                    partner_id=config.partner_id
                )
            )

        # Second pass: POST payloads one by one and collect the responses.
        http_responses = []
        for request_payload in request_payloads:
            http_response = requests.post(
                url=config.url,
                json=request_payload,
                headers=headers,
            )

            logger.info(f"payload='{request_payload}'")
            logger.info(f"response='{http_response.__dict__}'")
            http_responses.append(http_response)

        return http_responses
diff --git a/obsei_module/obsei-master/obsei/sink/elasticsearch_sink.py b/obsei_module/obsei-master/obsei/sink/elasticsearch_sink.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ade40ac9957109d6b91912f9f8f9120bbcd3ea6
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/sink/elasticsearch_sink.py
@@ -0,0 +1,101 @@
+from copy import deepcopy
+from typing import Any, Dict, List, Optional, Union
+
+from elasticsearch import Elasticsearch, RequestError
+from elasticsearch.helpers import bulk
+from pydantic import Field, PrivateAttr, SecretStr
+
+from obsei.sink.base_sink import BaseSink, BaseSinkConfig, Convertor
+from obsei.payload import TextPayload
+
+
class ElasticSearchSinkConfig(BaseSinkConfig):
    """Configuration for the Elasticsearch sink; builds the ES client and
    (optionally) creates the target index at construction time.
    """

    # This is done to avoid exposing member to API response
    _es_client: Elasticsearch = PrivateAttr()
    TYPE: str = "Elasticsearch"
    hosts: Union[str, List[str], None]
    index_name: str = "es_index"
    # Basic-auth credentials; fall back to elasticsearch_* env vars.
    username: SecretStr = Field(SecretStr(""), env="elasticsearch_username")
    password: SecretStr = Field(SecretStr(""), env="elasticsearch_password")
    ca_certs: str = Field("")
    verify_certs: bool = False
    # Create the index on startup (race-safe; see _create_index).
    create_index: bool = True
    timeout: int = 30
    # Full mapping body to use instead of the default keyword-only template.
    custom_mapping: Optional[Dict[str, Any]] = None
    # Refresh policy forwarded to the bulk helper.
    refresh_type: str = "wait_for"
    # Per-document bulk metadata; defaults to "create" ops on index_name.
    base_payload: Optional[Dict[str, Any]] = None

    def __init__(self, **data: Any):
        super().__init__(**data)
        self._es_client = Elasticsearch(
            hosts=self.hosts,
            http_auth=(
                self.username.get_secret_value(),
                self.password.get_secret_value(),
            ),
            ca_certs=self.ca_certs,
            verify_certs=self.verify_certs,
            timeout=self.timeout,
        )
        self.base_payload = self.base_payload or {
            "_op_type": "create",  # TODO update existing-document support?
            "_index": self.index_name,
        }
        if self.create_index:
            self._create_index(self.index_name)

    def _create_index(self, index_name: str) -> None:
        """Create the index with either the custom or the default mapping."""
        if self.custom_mapping:
            mapping = self.custom_mapping
        else:
            # Default: index every string field as a non-analyzed keyword.
            # NOTE(review): this dict wraps the body under a "mappings" key and
            # is then passed via the `mappings=` kwarg below, which looks like
            # double nesting ({"mappings": {"mappings": ...}}) — confirm
            # against the elasticsearch-py client version in use.
            mapping = {
                "mappings": {
                    "dynamic_templates": [
                        {
                            "strings": {
                                "path_match": "*",
                                "match_mapping_type": "string",
                                "mapping": {"type": "keyword"},
                            }
                        }
                    ],
                }
            }

        try:
            self._es_client.indices.create(index=index_name, mappings=mapping)
        except RequestError as e:
            # With multiple workers we need to avoid race conditions, where:
            # - there's no index in the beginning
            # - both want to create one
            # - one fails as the other one already created it
            if not self._es_client.indices.exists(index=index_name):
                raise e

    def bulk(self, payloads: List[Dict[str, Any]]) -> Any:
        """Submit the prepared documents in a single bulk call."""
        return bulk(
            self._es_client, payloads, request_timeout=300, refresh=self.refresh_type
        )
+
+
class ElasticSearchSink(BaseSink):
    """Sink that bulk-indexes converted payloads into Elasticsearch."""

    def __init__(self, convertor: Convertor = Convertor(), **data: Any):
        super().__init__(convertor=convertor, **data)

    def send_data(  # type: ignore[override]
        self,
        analyzer_responses: List[TextPayload],
        config: ElasticSearchSinkConfig,
        **kwargs: Any
    ) -> Any:
        """Convert every response and submit them in one bulk request."""
        documents = [
            self.convertor.convert(
                analyzer_response=item,
                base_payload=deepcopy(config.base_payload),
            )
            for item in analyzer_responses
        ]
        return config.bulk(documents)
diff --git a/obsei_module/obsei-master/obsei/sink/http_sink.py b/obsei_module/obsei-master/obsei/sink/http_sink.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d07845608180ac934e82cbdbb43dc747b7ce6ad
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/sink/http_sink.py
@@ -0,0 +1,49 @@
+from copy import deepcopy
+from typing import Any, Dict, List, Optional
+from urllib.request import Request, urlopen
+
+from obsei.misc.utils import obj_to_json
+from obsei.sink.base_sink import BaseSink, BaseSinkConfig, Convertor
+from obsei.payload import TextPayload
+
# Fallback HTTP headers used when the sink config does not provide any.
DEFAULT_HEADERS = {"Content-type": "application/json"}
+
+
class HttpSinkConfig(BaseSinkConfig):
    """Configuration for the generic HTTP sink."""

    TYPE: str = "Http"
    # Endpoint the converted payloads are POSTed to.
    url: str
    # Optional request headers; DEFAULT_HEADERS is used when falsy.
    headers: Optional[Dict[str, Any]] = None
    # Static fields used as the base of every converted payload.
    base_payload: Optional[Dict[str, Any]] = None
    # analyzer_output to payload mapping
    payload_mapping: Optional[Dict[str, List[str]]] = None
    # NOTE(review): payload_mapping/field_conversion are not read by this
    # module's send_data — presumably consumed by a convertor; confirm.
    field_conversion: Optional[Dict[str, str]] = None
+
+
class HttpSink(BaseSink):
    """Generic sink that POSTs each converted payload to a configured URL."""

    def __init__(self, convertor: Convertor = Convertor(), **data: Any):
        super().__init__(convertor=convertor, **data)

    def send_data(  # type: ignore[override]
        self, analyzer_responses: List[TextPayload], config: HttpSinkConfig, **kwargs: Any
    ) -> Any:
        """Convert every response and send it to ``config.url``.

        Returns the raw urllib responses, one per payload.
        """
        headers = config.headers if config.headers else DEFAULT_HEADERS

        request_bodies = [
            self.convertor.convert(
                analyzer_response=item,
                base_payload=dict()
                if config.base_payload is None
                else deepcopy(config.base_payload),
            )
            for item in analyzer_responses
        ]

        responses = []
        for body in request_bodies:
            request = Request(config.url, data=obj_to_json(body), headers=headers)
            responses.append(urlopen(request))

        return responses
diff --git a/obsei_module/obsei-master/obsei/sink/jira_sink.py b/obsei_module/obsei-master/obsei/sink/jira_sink.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6a8bef3c51c01024750354ef185c87b0802e49c
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/sink/jira_sink.py
@@ -0,0 +1,106 @@
+import logging
+import textwrap
+from typing import Any, Dict, List, Optional
+
+from atlassian import Jira
+from pydantic import Field, PrivateAttr, SecretStr
+
+from obsei.sink.base_sink import BaseSink, BaseSinkConfig, Convertor
+from obsei.payload import TextPayload
+from obsei.misc.utils import obj_to_markdown
+
# Module-level logger for this sink.
logger = logging.getLogger(__name__)
+
+
class JiraPayloadConvertor(Convertor):
    """Convert a TextPayload into Jira issue fields (summary + description)."""

    def convert(
        self,
        analyzer_response: TextPayload,
        base_payload: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> Dict[str, Any]:
        """Build the Jira field dict.

        Recognized kwargs: ``summary_max_length`` caps the summary width
        (default 50); ``labels_count`` is accepted for future label support.
        """
        payload = base_payload if base_payload else dict()

        payload["description"] = obj_to_markdown(
            obj=analyzer_response,
            str_enclose_start="{quote}",
            str_enclose_end="{quote}",
        )
        payload["summary"] = textwrap.shorten(
            text=analyzer_response.processed_text,
            width=kwargs.get("summary_max_length", 50),
        )

        # TODO: Find correct payload to update labels fields
        # (the ``labels_count`` kwarg is reserved for that future use).

        return payload
+
+
class JiraSinkConfig(BaseSinkConfig):
    """Configuration for the Jira sink; builds a Jira client on construction.

    Raises:
        AttributeError: if username or password is missing.
    """

    # This is done to avoid exposing member to API response
    _jira_client: Jira = PrivateAttr()
    TYPE: str = "Jira"
    # Base URL of the Jira instance.
    url: str
    # Credentials; read from jira_username/jira_password env vars when unset.
    username: Optional[SecretStr] = Field(None, env="jira_username")
    password: Optional[SecretStr] = Field(None, env="jira_password")
    # Sent as the "issuetype" / "project" fields of every created issue.
    issue_type: Dict[str, str]
    project: Dict[str, str]
    # Forwarded to Jira's create_issue call.
    update_history: bool = True
    verify_ssl: bool = False
    # Maximum width of the generated issue summary.
    summary_max_length: int = 50
    labels_count: int = 2  # Number of labels to fetch

    def __init__(self, **data: Any):
        super().__init__(**data)
        # Fail fast: both credentials are mandatory for the Jira client.
        if self.username is None or self.password is None:
            raise AttributeError(
                "Jira informer need username and password"
            )

        self._jira_client = Jira(
            url=self.url,
            username=self.username.get_secret_value(),
            password=self.password.get_secret_value(),
            verify_ssl=self.verify_ssl,
        )

    def get_jira_client(self) -> Jira:
        """Return the Jira client created at construction time."""
        return self._jira_client
+
+
class JiraSink(BaseSink):
    """Sink that creates one Jira issue per analyzer response."""

    def __init__(self, convertor: Convertor = JiraPayloadConvertor(), **data: Any):
        super().__init__(convertor=convertor, **data)

    def send_data(  # type: ignore[override]
        self,
        analyzer_responses: List[TextPayload],
        config: JiraSinkConfig,
        **kwargs: Any,
    ) -> Any:
        """Convert each response to Jira fields and create the issues.

        Returns the Jira API response for every created issue.
        """
        issue_fields = [
            self.convertor.convert(
                analyzer_response=item,
                base_payload={
                    "project": config.project,
                    "issuetype": config.issue_type,
                },
                summary_max_length=config.summary_max_length,
                labels_count=config.labels_count,
            )
            for item in analyzer_responses
        ]

        responses = []
        for fields in issue_fields:
            created = config.get_jira_client().create_issue(
                fields=fields, update_history=config.update_history
            )
            logger.info(f"response='{created}'")
            responses.append(created)

        return responses
diff --git a/obsei_module/obsei-master/obsei/sink/logger_sink.py b/obsei_module/obsei-master/obsei/sink/logger_sink.py
new file mode 100644
index 0000000000000000000000000000000000000000..970da21b61d3ee06221f128b5972ede0a323d1fb
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/sink/logger_sink.py
@@ -0,0 +1,39 @@
+import logging
+from logging import Logger
+from typing import Any, List, Optional
+
+from pydantic import Field
+
+from obsei.payload import TextPayload
+from obsei.sink.base_sink import BaseSink, BaseSinkConfig, Convertor
+
+
class LoggerSinkConfig(BaseSinkConfig):
    """Configuration for the logging sink."""

    TYPE: str = "Logging"
    # Destination logger; defaults to this module's logger.
    logger: Logger = Field(logging.getLogger(__name__))
    # Log level used for every emitted record.
    level: int = Field(logging.INFO)
+
+
class LoggerSink(BaseSink):
    """Sink that writes each converted payload to a configured logger."""

    TYPE: str = "Logging"

    def __init__(self, convertor: Convertor = Convertor(), **data: Any):
        super().__init__(convertor=convertor, **data)

    def send_data(  # type: ignore[override]
        self,
        analyzer_responses: List[TextPayload],
        config: LoggerSinkConfig,
        **kwargs: Any,
    ) -> Any:
        """Convert every response, then log each one at ``config.level``."""
        converted = [
            self.convertor.convert(analyzer_response=item)
            for item in analyzer_responses
        ]

        for entry in converted:
            # Prefer the attribute dict when the entry is an object.
            printable = vars(entry) if hasattr(entry, "__dict__") else entry
            config.logger.log(level=config.level, msg=f"{printable}")
diff --git a/obsei_module/obsei-master/obsei/sink/pandas_sink.py b/obsei_module/obsei-master/obsei/sink/pandas_sink.py
new file mode 100644
index 0000000000000000000000000000000000000000..98f430faf95d2129c515f5e4d51de6cdd74971c8
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/sink/pandas_sink.py
@@ -0,0 +1,65 @@
+from typing import Any, Dict, List, Optional
+
+from pandas import DataFrame
+
+from obsei.payload import TextPayload
+from obsei.misc.utils import flatten_dict
+from obsei.sink.base_sink import BaseSink, BaseSinkConfig, Convertor
+
+
class PandasConvertor(Convertor):
    """Flatten a TextPayload (plus optional base payload) into a flat dict."""

    def convert(
        self,
        analyzer_response: TextPayload,
        base_payload: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> Dict[str, Any]:
        """Merge the base payload with the response dict and flatten the result.

        Response fields win over base-payload fields on key collisions.
        """
        combined: Dict[str, Any] = dict(base_payload or {})
        combined.update(analyzer_response.to_dict())
        return flatten_dict(combined)
+
+
class PandasSinkConfig(BaseSinkConfig):
    """Configuration for the pandas sink."""

    TYPE: str = "Pandas"
    # Target DataFrame; a fresh empty one is created when not supplied.
    dataframe: Optional[DataFrame] = None
    # By default it will include all the columns
    include_columns_list: Optional[List[str]] = None

    def __init__(self, **data: Any):
        super().__init__(**data)

        if self.dataframe is None:
            self.dataframe = DataFrame()
+
+
class PandasSink(BaseSink):
    """Sink that accumulates converted payloads as rows of a pandas DataFrame."""

    TYPE: str = "Pandas"

    def __init__(self, convertor: Convertor = PandasConvertor(), **data: Any):
        super().__init__(convertor=convertor, **data)

    def send_data(  # type: ignore[override]
        self,
        analyzer_responses: List[TextPayload],
        config: PandasSinkConfig,
        **kwargs: Any,
    ) -> Any:
        """Convert responses to flat dicts, optionally filter columns, and
        append them as rows to ``config.dataframe``.

        Returns the updated DataFrame (also stored back on the config).
        """
        # Local import keeps the module-level import block unchanged.
        from pandas import concat

        responses = []
        for analyzer_response in analyzer_responses:
            converted_response = self.convertor.convert(
                analyzer_response=analyzer_response
            )
            response: Optional[Dict[str, Any]] = None
            if config.include_columns_list:
                # Keep only the explicitly requested columns.
                response = {
                    key: value
                    for key, value in converted_response.items()
                    if key in config.include_columns_list
                }
            else:
                response = converted_response
            responses.append(response)

        if config.dataframe is not None:
            # BUG FIX: DataFrame.append() was deprecated in pandas 1.4 and
            # removed in 2.0; concat with a DataFrame built from the new rows
            # is the supported replacement.
            config.dataframe = concat(
                [config.dataframe, DataFrame(responses)], ignore_index=True
            )

        return config.dataframe
diff --git a/obsei_module/obsei-master/obsei/sink/slack_sink.py b/obsei_module/obsei-master/obsei/sink/slack_sink.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad613c9f4c21cd42013259d74f07bcb580ae1ced
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/sink/slack_sink.py
@@ -0,0 +1,71 @@
+import json
+import logging
+from typing import Any, List, Optional
+
+from jinja2 import Template
+from pydantic import Field, PrivateAttr, SecretStr
+from slack_sdk import WebClient
+
+from obsei.sink.base_sink import BaseSink, BaseSinkConfig
+from obsei.payload import TextPayload
+
# Module-level logger for this sink.
logger = logging.getLogger(__name__)
+
+
class SlackSinkConfig(BaseSinkConfig):
    """Configuration for the Slack sink; builds a WebClient on construction.

    Raises:
        AttributeError: if slack_token or channel_id is missing.
    """

    # This is done to avoid exposing member to API response
    _slack_client: WebClient = PrivateAttr()
    TYPE: str = "Slack"

    # Bot token; read from the slack_token env var when unset.
    slack_token: Optional[SecretStr] = Field(None, env="slack_token")
    # Channel to post into; read from the slack_channel_id env var when unset.
    channel_id: str = Field("", env="slack_channel_id")
    # Optional Jinja2 template used to render each payload into a message.
    jinja_template: Optional[str] = None
    # Avatar shown next to the posted message.
    icon_url: str = "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/obsei_200x200.png"
    is_markdown: bool = True

    def __init__(self, **data: Any):
        super().__init__(**data)
        # Fail fast: both token and channel are mandatory.
        if self.slack_token is None or self.channel_id == '':
            raise AttributeError(
                "Slack informer need slack_token and channel_id"
            )

        self._slack_client = WebClient(token=self.slack_token.get_secret_value())

    def get_slack_client(self) -> WebClient:
        """Return the WebClient created at construction time."""
        return self._slack_client
+
+
class SlackSink(BaseSink):
    """Sink that posts each converted payload as a Slack message."""

    def __init__(self, **data: Any):
        super().__init__(**data)

    def send_data(  # type: ignore[override]
        self,
        analyzer_responses: List[TextPayload],
        config: SlackSinkConfig,
        **kwargs: Any,
    ) -> Any:
        """Render each payload (via the optional Jinja template) and post it
        to the configured Slack channel.

        Returns the Slack API response for every posted message.
        """
        responses = []
        payloads = []
        for analyzer_response in analyzer_responses:
            payloads.append(self.convertor.convert(analyzer_response=analyzer_response))

        for payload in payloads:
            if config.jinja_template is not None:
                template = Template(config.jinja_template)
                message = template.render(payload=payload)
            else:
                # BUG FIX: the two f-strings must form one parenthesized
                # expression. Previously the second literal was a standalone
                # no-op statement, so the segmented data was silently dropped
                # from the message.
                message = (
                    f'Message: `{str(payload["processed_text"])}` '
                    f'```{json.dumps(payload["segmented_data"], indent=2, ensure_ascii=False)}```'
                )

            response = config.get_slack_client().chat_postMessage(
                channel=config.channel_id,
                text=message,
                icon_url=config.icon_url,
                mrkdwn=config.is_markdown,
            )
            logger.info(f"response='{response}'")
            responses.append(response)

        return responses
diff --git a/obsei_module/obsei-master/obsei/sink/zendesk_sink.py b/obsei_module/obsei-master/obsei/sink/zendesk_sink.py
new file mode 100644
index 0000000000000000000000000000000000000000..2992b6139fa75f83b58925127a1472aeb8b9cdd5
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/sink/zendesk_sink.py
@@ -0,0 +1,148 @@
+import json
+import logging
+import textwrap
+from copy import deepcopy
+
+import requests
+from typing import Any, Dict, List, Mapping, Optional
+
+from pydantic import BaseModel, Field, SecretStr
+
+from obsei.sink.base_sink import BaseSink, BaseSinkConfig, Convertor
+from obsei.payload import TextPayload
+from obsei.misc.utils import obj_to_markdown
+
# Module-level logger for this sink.
logger = logging.getLogger(__name__)
+
+
class ZendeskPayloadConvertor(Convertor):
    """Convert a TextPayload into a Zendesk "create ticket" request body."""

    # Refer https://developer.zendesk.com/api-reference/ticketing/tickets/tickets/#create-ticket
    # for the payload details
    def convert(
        self,
        analyzer_response: TextPayload,
        base_payload: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> Dict[str, Any]:
        """Build ``{"ticket": {...}}``.

        Recognized kwargs: ``summary_max_length`` (subject width, default 50)
        and ``labels_count`` (number of tags, default 1).
        """
        summary_max_length = kwargs.get("summary_max_length", 50)

        payload = base_payload or dict()

        if "ticket" not in payload:
            payload["ticket"] = dict()

        if "comment" not in payload["ticket"]:
            payload["ticket"]["comment"] = dict()

        # BUG FIX: the Zendesk API expects the comment body, subject and tags
        # nested inside the "ticket" object; previously they were written at
        # the top level of the payload and the prepared "comment" dict was
        # left empty.
        # For non-html content, use the "body" key instead of "html_body".
        payload["ticket"]["comment"]["html_body"] = obj_to_markdown(
            obj=analyzer_response,
            str_enclose_start="{quote}",
            str_enclose_end="{quote}",
        )

        payload["ticket"]["subject"] = textwrap.shorten(
            text=analyzer_response.processed_text, width=summary_max_length
        )

        # Use up to labels_count segmented-data values as ticket tags.
        if analyzer_response.segmented_data is not None and isinstance(
            analyzer_response.segmented_data, Mapping
        ):
            labels_count = kwargs.get("labels_count", 1)
            labels = [
                str(v)
                for k, v in analyzer_response.segmented_data.items()
            ]
            payload["ticket"]["tags"] = labels[:labels_count]

        return payload
+
+
class ZendeskCredInfo(BaseModel):
    """Zendesk credentials: OAuth token, API token or email+password.

    NOTE(review): ``env=`` on a plain BaseModel field is not read from the
    environment (only BaseSettings does that) — confirm intended.
    """

    email: Optional[str] = Field(None, env="zendesk_email")
    password: Optional[SecretStr] = Field(None, env="zendesk_password")
    oauth_token: Optional[SecretStr] = Field(None, env="zendesk_oauth_token")
    token: Optional[SecretStr] = Field(None, env="zendesk_token")

    def __init__(self, **data: Any):
        super().__init__(**data)

        # Guard clauses: require at least one credential, and forbid the
        # ambiguous password + token combination.
        if not (self.oauth_token or self.token or self.email or self.password):
            raise ValueError("At least one credential is required")

        if self.password and self.token:
            raise ValueError("Only one of password or token can be provided")

    def get_session(self) -> requests.Session:
        """Build a requests session carrying the strongest available credential."""
        session = requests.Session()

        if self.oauth_token:
            bearer = self.oauth_token.get_secret_value()
            session.headers.update({"Authorization": f'Bearer {bearer}'})
        elif self.email and self.token:
            session.auth = (f'{self.email}/token', self.token.get_secret_value())
        elif self.email and self.password:
            session.auth = (self.email, self.password.get_secret_value())

        return session
+
+
class ZendeskSinkConfig(BaseSinkConfig):
    """Configuration for the Zendesk sink.

    The ticket endpoint is assembled from scheme, optional subdomain, domain
    and API path, e.g. ``https://mycompany.zendesk.com/api/v2/tickets.json``.
    """

    TYPE: str = "Zendesk"
    # REST path for ticket creation.
    ticket_api: str = Field(default="/api/v2/tickets.json")
    scheme: str = Field(default="https", env="zendesk_scheme")
    domain: str = Field(default="zendesk.com", env="zendesk_domain")
    # Account subdomain, e.g. "mycompany" for mycompany.zendesk.com.
    subdomain: Optional[str] = Field(None, env="zendesk_subdomain")
    # Credentials; built from environment variables when not supplied.
    cred_info: Optional[ZendeskCredInfo] = Field(None)
    # Maximum width of the generated ticket subject.
    summary_max_length: int = 50
    labels_count: int = 3  # Number of labels to fetch
    # Static fields merged into every converted payload.
    base_payload: Optional[Dict[str, Any]] = None

    def __init__(self, **data: Any):
        super().__init__(**data)

        self.cred_info = self.cred_info or ZendeskCredInfo()

    def get_endpoint(self) -> str:
        """Return the full ticket-creation URL."""
        # BUG FIX: the prefix previously started with "/", producing a
        # malformed URL like "https:///sub.zendesk.com/..."; the subdomain
        # must be joined as "<subdomain>." directly after "scheme://".
        sub_prefix = "" if self.subdomain is None or self.subdomain == '' else f"{self.subdomain}."
        return f'{self.scheme}://{sub_prefix}{self.domain}{self.ticket_api}'
+
+
class ZendeskSink(BaseSink):
    """Sink that creates one Zendesk ticket per analyzer response."""

    def __init__(self, convertor: Convertor = ZendeskPayloadConvertor(), **data: Any):
        super().__init__(convertor=convertor, **data)

    def send_data(  # type: ignore[override]
        self,
        analyzer_responses: List[TextPayload],
        config: ZendeskSinkConfig,
        **kwargs: Any,
    ) -> Any:
        """Convert each response into a ticket payload and POST it to Zendesk.

        Returns the HTTP responses (empty list when credentials are missing).
        """
        responses: List[Any] = []
        payloads: List[Dict[str, Any]] = []

        if config.cred_info is None:
            logger.error("Zendesk credentials are not provided")
            return responses

        for analyzer_response in analyzer_responses:
            payloads.append(
                self.convertor.convert(
                    analyzer_response=analyzer_response,
                    base_payload=dict()
                    if config.base_payload is None
                    else deepcopy(config.base_payload),
                    summary_max_length=config.summary_max_length,
                    labels_count=config.labels_count,
                )
            )

        for payload in payloads:
            session = config.cred_info.get_session()
            # BUG FIX: send the converted ticket payload itself. The previous
            # code posted json.dumps(payload["segmented_data"], ...): the
            # convertor never sets a "segmented_data" key (KeyError), and
            # passing a pre-serialized string via ``json=`` double-encodes
            # the body.
            response = session.post(config.get_endpoint(), json=payload)
            logger.info(f"response='{response}'")
            responses.append(response)

        return responses
diff --git a/obsei_module/obsei-master/obsei/source/__init__.py b/obsei_module/obsei-master/obsei/source/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/obsei_module/obsei-master/obsei/source/__pycache__/website_crawler_source.cpython-311.pyc b/obsei_module/obsei-master/obsei/source/__pycache__/website_crawler_source.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e2d6642cbe8b71fd8a91a71c7fcc9df589961c96
Binary files /dev/null and b/obsei_module/obsei-master/obsei/source/__pycache__/website_crawler_source.cpython-311.pyc differ
diff --git a/obsei_module/obsei-master/obsei/source/appstore_scrapper.py b/obsei_module/obsei-master/obsei/source/appstore_scrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..f180aa23c4de4de307c85a54418ff5334d7be2c8
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/source/appstore_scrapper.py
@@ -0,0 +1,150 @@
+import logging
+import re
+from datetime import datetime, timezone
+from typing import Any, Dict, List, Optional, Tuple
+from urllib import parse
+
+from app_store.app_store_reviews_reader import AppStoreReviewsReader
+
+from obsei.misc.web_search import perform_search
+from obsei.source.base_source import BaseSource, BaseSourceConfig
+from obsei.payload import TextPayload
+from obsei.misc.utils import (
+ DATETIME_STRING_PATTERN,
+ DEFAULT_LOOKUP_PERIOD,
+ convert_utc_time,
+)
+
# Module-level logger for this source.
logger = logging.getLogger(__name__)
+
+
class AppStoreScrapperConfig(BaseSourceConfig):
    """Configuration for the App Store review scrapper source.

    One of ``app_url``, ``app_id`` or ``app_name`` must be provided; the
    remaining fields are derived when possible.

    Raises:
        ValueError: when no app id can be resolved.
    """

    TYPE: str = "AppStoreScrapper"
    # Full store URL, e.g. https://apps.apple.com/us/app/<name>/id<digits>.
    app_url: Optional[str] = None
    # Country codes to scrape; defaults to ["us"].
    countries: Optional[List[str]] = None
    app_id: Optional[str] = None
    app_name: Optional[str] = None
    # Relative period (e.g. "1h") or absolute datetime string.
    lookup_period: Optional[str] = None
    # Cap on the number of reviews fetched per country.
    max_count: Optional[int] = None

    def __init__(self, **data: Any):
        super().__init__(**data)

        # Derive id/countries/name from the URL, or search by name as fallback.
        if self.app_url is not None:
            self.app_id, self.countries, self.app_name = AppStoreScrapperConfig.parse_app_url(self.app_url)
        else:
            if not self.app_id and self.app_name:
                self.app_id = AppStoreScrapperConfig.search_id(self.app_name)

        if not self.app_id:
            # BUG FIX: the message referred to `package_name`, a Play Store
            # config field that does not exist here.
            raise ValueError("Valid `app_id`, `app_name` or `app_url` is mandatory")

        self.countries = self.countries or ["us"]
        self.app_name = self.app_name or self.app_id

    @classmethod
    def parse_app_url(cls, app_url: str) -> Tuple[Optional[str], Optional[List[str]], Optional[str]]:
        """Extract ``(app_id, [country], app_name)`` from an App Store URL.

        Expects the path shape ``/<country>/app/<name>/id<digits>``; returns
        Nones when the URL does not match.
        """
        parsed_url = parse.urlparse(app_url)
        url_paths = parsed_url.path.split("/")

        countries = app_name = app_id = None
        if len(url_paths) == 5:
            countries = [url_paths[1]]
            app_name = url_paths[3]
            app_ids = url_paths[4].split("id")
            app_id = None if len(app_ids) != 2 else app_ids[1]

        return app_id, countries, app_name

    # Code is influenced from https://github.com/cowboy-bebug/app-store-scraper
    @classmethod
    def search_id(cls, app_name: str, store: str = "app") -> str:
        """Resolve an app id by web-searching the store landing pages.

        Raises:
            RuntimeError: when no ``id<digits>`` pattern appears in the result.
        """
        if store == "app":
            landing_url = "apps.apple.com"
            request_host = "amp-api.apps.apple.com"
        else:
            landing_url = "podcasts.apple.com"
            request_host = "amp-api.podcasts.apple.com"

        base_request_url = f"https://{request_host}"
        search_response = perform_search(
            request_url=base_request_url, query=f"app store {app_name}"
        )

        pattern = fr"{landing_url}/[a-z]{{2}}/.+?/id([0-9]+)"
        match_object = re.search(pattern, search_response.text)
        if match_object:
            app_id = str(match_object.group(1))
        else:
            raise RuntimeError("Pattern matching is not found")
        return app_id
+
+
class AppStoreScrapperSource(BaseSource):
    """Source that fetches App Store reviews newer than a stored checkpoint.

    Per-country state keeps the newest review time ("since_time") and newest
    review id ("since_id") so repeated lookups only return new reviews.
    """

    NAME: Optional[str] = "AppStoreScrapper"

    def lookup(self, config: AppStoreScrapperConfig, **kwargs: Any) -> List[TextPayload]:  # type: ignore[override]
        """Fetch reviews for every configured country as TextPayloads.

        Pass ``id=<workflow id>`` in kwargs to persist lookup state in the store.
        """
        source_responses: List[TextPayload] = []

        # Get data from state
        identifier: str = kwargs.get("id", None)
        state: Optional[Dict[str, Any]] = (
            None
            if identifier is None or self.store is None
            else self.store.get_source_state(identifier)
        )
        update_state: bool = True if identifier else False
        state = state or dict()

        if config.countries is None or len(config.countries) == 0:
            logger.warning("`countries` in config should not be empty or None")
            return source_responses

        for country in config.countries:
            country_stat: Dict[str, Any] = state.get(country, dict())
            # Prefer the per-country checkpoint over the configured period.
            lookup_period: str = country_stat.get("since_time", config.lookup_period)
            lookup_period = lookup_period or DEFAULT_LOOKUP_PERIOD
            # Short values (e.g. "1h") are relative periods; otherwise an
            # absolute datetime string is expected.
            if len(lookup_period) <= 5:
                since_time = convert_utc_time(lookup_period)
            else:
                since_time = datetime.strptime(lookup_period, DATETIME_STRING_PATTERN)
            since_time = since_time.replace(tzinfo=timezone.utc)

            last_since_time: datetime = since_time

            since_id: Optional[int] = country_stat.get("since_id", None)
            last_index = since_id
            state[country] = country_stat

            scrapper = AppStoreReviewsReader(country=country, app_id=config.app_id)
            reviews = scrapper.fetch_reviews(after=since_time, since_id=since_id)
            reviews = reviews or []
            if config.max_count is not None and config.max_count < len(reviews):
                reviews = reviews[: config.max_count]

            for review in reviews:
                source_responses.append(
                    TextPayload(
                        processed_text=f"{review.title}. {review.content}",
                        meta=vars(review) if hasattr(review, "__dict__") else review,
                        source_name=self.NAME,
                    )
                )

                review_time = review.date.replace(tzinfo=timezone.utc)
                # NOTE(review): a review older than since_time is appended
                # above *before* this cutoff triggers — confirm intended.
                if review_time < since_time:
                    break
                if last_since_time is None or last_since_time < review_time:
                    last_since_time = review_time
                if last_index is None or last_index < review.id:
                    last_index = review.id

            # Persist the newest review time/id as the next checkpoint.
            country_stat["since_time"] = last_since_time.strftime(
                DATETIME_STRING_PATTERN
            )
            country_stat["since_id"] = last_index

        if update_state and self.store is not None:
            self.store.update_source_state(workflow_id=identifier, state=state)

        return source_responses
diff --git a/obsei_module/obsei-master/obsei/source/base_source.py b/obsei_module/obsei-master/obsei/source/base_source.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd2312cf4ef65a0ef58ad21efda401502a99f1c0
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/source/base_source.py
@@ -0,0 +1,25 @@
+from abc import abstractmethod
+from typing import List, Optional, Any
+
+from pydantic_settings import BaseSettings
+
+from obsei.payload import TextPayload
+from obsei.workflow.base_store import BaseStore
+
+
class BaseSourceConfig(BaseSettings):
    """Base class for all source configurations; TYPE discriminates subclasses."""

    TYPE: str = "Base"

    class Config:
        # Allow non-pydantic types (clients, stores) as field values.
        arbitrary_types_allowed = True
+
class BaseSource(BaseSettings):
    """Abstract base for all sources; subclasses implement :meth:`lookup`."""

    # Optional state store used to checkpoint lookups between runs.
    store: Optional[BaseStore] = None

    @abstractmethod
    def lookup(self, config: BaseSourceConfig, **kwargs: Any) -> List[TextPayload]:
        """Fetch the data described by ``config`` and return it as TextPayloads."""
        pass

    class Config:
        # Allow non-pydantic types (e.g. the store) as field values.
        arbitrary_types_allowed = True
diff --git a/obsei_module/obsei-master/obsei/source/email_source.py b/obsei_module/obsei-master/obsei/source/email_source.py
new file mode 100644
index 0000000000000000000000000000000000000000..907463b6e41357b60dded8989b13a04e82a4dcb9
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/source/email_source.py
@@ -0,0 +1,288 @@
+import email
+import imaplib
+import logging
+from datetime import datetime
+from email.header import decode_header
+from email.message import Message
+from typing import Any, Dict, List, Optional
+
+import pytz
+from pydantic import Field, PrivateAttr, SecretStr
+from pydantic_settings import BaseSettings
+
+from obsei.payload import TextPayload
+from obsei.misc.utils import (
+ DATETIME_STRING_PATTERN,
+ DEFAULT_LOOKUP_PERIOD,
+ convert_utc_time,
+ text_from_html,
+)
+from obsei.source.base_source import BaseSource, BaseSourceConfig
+
# Module-level logger for this source.
logger = logging.getLogger(__name__)
+
+
class EmailCredInfo(BaseSettings):
    """IMAP credentials; fall back to email_username/email_password env vars."""

    username: Optional[SecretStr] = Field(None, env="email_username")
    password: Optional[SecretStr] = Field(None, env="email_password")
+
+
class EmailConfig(BaseSourceConfig):
    """Configuration for the email source; logs into IMAP over SSL at
    construction time.

    Raises:
        ValueError: when username or password cannot be resolved.
    """

    # This is done to avoid exposing member to API response
    _imap_client: imaplib.IMAP4 = PrivateAttr()
    TYPE: str = "Email"
    # List of IMAP servers for most commonly used email providers
    # https://www.systoolsgroup.com/imap/
    # Also, if you're using a Gmail account then make sure you allow less secure apps on your account -
    # https://myaccount.google.com/lesssecureapps?pli=1
    # Also enable IMAP access -
    # https://mail.google.com/mail/u/0/#settings/fwdandpop
    imap_server: str
    imap_port: Optional[int] = None
    # NOTE(review): declared but not consulted by lookup() — attachment
    # download is currently unsupported there; confirm before relying on it.
    download_attachments: Optional[bool] = False
    mailboxes: List[str] = Field(["INBOX"])
    cred_info: Optional[EmailCredInfo] = Field(None)
    # Relative period (e.g. "1h") or absolute datetime string.
    lookup_period: Optional[str] = None

    def __init__(self, **data: Any):
        super().__init__(**data)

        self.cred_info = self.cred_info or EmailCredInfo()

        if self.cred_info.password is None or self.cred_info.username is None:
            raise ValueError("Email account `username` and `password` is required")
        # Always connect over SSL; imaplib's default port is used when unset.
        if self.imap_port:
            self._imap_client = imaplib.IMAP4_SSL(
                host=self.imap_server, port=self.imap_port
            )
        else:
            self._imap_client = imaplib.IMAP4_SSL(self.imap_server)

        self._imap_client.login(
            user=self.cred_info.username.get_secret_value(),
            password=self.cred_info.password.get_secret_value(),
        )

    def __del__(self) -> None:
        # NOTE(review): if __init__ failed before the client was created, this
        # raises AttributeError during GC; logout() may also fail on a dropped
        # connection — consider guarding.
        # self._imap_client.close()
        self._imap_client.logout()

    def get_client(self) -> imaplib.IMAP4:
        """Return the logged-in IMAP client created at construction time."""
        return self._imap_client
+
+
class EmailSource(BaseSource):
    """Source that fetches emails over IMAP and converts them to TextPayloads.

    Per-mailbox state keeps the newest message time ("since_time") and newest
    Message-ID ("since_message_id") so repeated lookups only return new mail.
    """

    NAME: str = "Email"

    @staticmethod
    def clean(text: str) -> str:
        # clean text for creating a folder
        return "".join(c if c.isalnum() else "_" for c in text)

    def lookup(self, config: EmailConfig, **kwargs: Any) -> List[TextPayload]:  # type: ignore[override]
        """Fetch new emails from every configured mailbox, newest first.

        Pass ``id=<workflow id>`` in kwargs to persist lookup state in the store.
        """
        source_responses: List[TextPayload] = []

        # Get data from state (local renamed so the `id` builtin is not shadowed).
        identifier: str = kwargs.get("id", None)
        state: Optional[Dict[str, Any]] = (
            None
            if identifier is None or self.store is None
            else self.store.get_source_state(identifier)
        )
        update_state: bool = True if identifier else False
        state = state or dict()

        imap_client = config.get_client()

        for mailbox in config.mailboxes:
            need_more_lookup = True

            status, messages = imap_client.select(mailbox=mailbox, readonly=True)
            if status != "OK":
                logger.warning(f"Not able to connect with {mailbox}: {status}")
                continue

            mailbox_stat: Dict[str, Any] = state.get(mailbox, dict())
            lookup_period: str = mailbox_stat.get(
                "since_time", config.lookup_period or DEFAULT_LOOKUP_PERIOD
            )
            # Short values (e.g. "1h") are relative periods; otherwise an
            # absolute datetime string is expected.
            if len(lookup_period) <= 5:
                since_time = convert_utc_time(lookup_period)
            else:
                since_time = datetime.strptime(lookup_period, DATETIME_STRING_PATTERN)

            # Normalize to UTC so comparisons with parsed email dates are valid.
            if since_time.tzinfo is None:
                since_time = since_time.replace(tzinfo=pytz.utc)
            else:
                since_time = since_time.astimezone(pytz.utc)

            last_since_time: datetime = since_time
            # Message-IDs are strings (RFC 5322), not ints.
            since_id: Optional[str] = mailbox_stat.get("since_message_id", None)
            last_index = since_id

            state[mailbox] = mailbox_stat

            # BUG FIX: select() returns the message count as bytes (e.g.
            # b'42'); int() accepts bytes directly, whereas the previous
            # int(str(messages[0])) evaluated int("b'42'") and always raised
            # ValueError.
            num_of_emails = int(messages[0])

            # Read in reverse order means latest emails first
            # Most of the code is borrowed from
            # https://www.thepythoncode.com/article/reading-emails-in-python
            # and modified to suit here.
            for index in range(num_of_emails, 0, -1):
                email_meta: Dict[str, Any] = dict()

                # fetch the email message by ID
                status, email_message = imap_client.fetch(str(index), "(RFC822)")

                email_content: str = ""

                for response in email_message:
                    if isinstance(response, tuple):
                        # parse a bytes email into a message object
                        msg = email.message_from_bytes(response[1])

                        email_meta["subject"] = self._parse_email_header(msg, "Subject")
                        email_meta["from_address"] = self._parse_email_header(
                            msg, "From"
                        )
                        email_meta["to_address"] = self._parse_email_header(msg, "To")
                        date_received_str = self._parse_email_header(msg, "Date")

                        # Try the common RFC 2822 date variants in turn.
                        try:
                            date_received = datetime.strptime(
                                date_received_str, "%a, %d %b %Y %H:%M:%S %Z"
                            )
                        except Exception:
                            try:
                                date_received = datetime.strptime(
                                    date_received_str, "%a, %d %b %Y %H:%M:%S %z"
                                )
                            except Exception:
                                date_received = datetime.strptime(
                                    date_received_str, "%a, %d %b %Y %H:%M:%S %z (%Z)"
                                )

                        # Normalize to UTC before comparing with since_time.
                        if date_received.tzinfo is None:
                            date_received = date_received.replace(tzinfo=pytz.utc)
                        else:
                            date_received = date_received.astimezone(pytz.utc)
                        email_meta["date_received"] = date_received
                        email_meta["message_id"] = self._parse_email_header(
                            msg, "Message-ID"
                        )

                        part_id = 0
                        # if the email message is multipart
                        if msg.is_multipart():
                            # iterate over email parts
                            for part in msg.walk():
                                part_id_str = f"part_{part_id}"
                                # extract content type of email
                                content_type = part.get_content_type()
                                content_disposition = str(
                                    part.get("Content-Disposition")
                                )

                                email_meta[part_id_str] = dict()
                                email_meta[part_id_str]["content_type"] = content_type
                                email_meta[part_id_str][
                                    "content_disposition"
                                ] = content_disposition

                                if (
                                    "attachment" not in content_disposition
                                    and "text/" in content_type
                                ):
                                    try:
                                        # get the email body
                                        email_body = part.get_payload(
                                            decode=True
                                        ).decode()
                                        if content_type == "text/html":
                                            email_body = text_from_html(email_body)
                                        # append email body with existing
                                        email_meta[part_id_str][
                                            "email_body"
                                        ] = email_body
                                        email_content = (
                                            email_content + "\n" + email_body
                                        )
                                    except Exception:
                                        logger.error("Unable to parse email body")
                                elif "attachment" in content_disposition:
                                    logger.warning(
                                        "Email attachment download is not supported"
                                    )
                                    # Download attachment is commented currently
                                    # # download attachment
                                    # filename = part.get_filename()
                                    # if filename:
                                    #     folder_name = self.clean(subject)
                                    #     if not os.path.isdir(folder_name):
                                    #         # make a folder for this email (named after the subject)
                                    #         os.mkdir(folder_name)
                                    #     filepath = os.path.join(folder_name, filename)
                                    #     # download attachment and save it
                                    #     open(filepath, "wb").write(part.get_payload(decode=True))

                                part_id = part_id + 1
                        else:
                            part_id_str = f"part_{part_id}"
                            email_meta[part_id_str] = dict()
                            # extract content type of email
                            content_type = msg.get_content_type()
                            email_meta[part_id_str]["content_type"] = content_type

                            # get the email body
                            email_body = msg.get_payload(decode=True).decode()
                            if content_type == "text/html":
                                email_body = text_from_html(email_body)

                            email_meta[part_id_str]["email_body"] = email_body
                            email_content = email_content + "\n" + email_body

                        # Stop once messages are older than the checkpoint or
                        # the previously recorded Message-ID is reached.
                        if date_received <= since_time:
                            need_more_lookup = False
                            break
                        if last_index and last_index == email_meta["message_id"]:
                            need_more_lookup = False
                            break
                        if last_since_time is None or last_since_time < date_received:
                            last_since_time = date_received
                        if last_index is None:
                            # First (newest) message seen becomes the new id checkpoint.
                            last_index = email_meta["message_id"]

                        source_responses.append(
                            TextPayload(
                                processed_text="\n".join(
                                    [email_meta.get("subject", ""), email_content]
                                ),
                                meta=email_meta,
                                source_name=self.NAME,
                            )
                        )

                if not need_more_lookup:
                    break

            mailbox_stat["since_time"] = last_since_time.strftime(
                DATETIME_STRING_PATTERN
            )
            # BUG FIX: persist under "since_message_id" — the key read at the
            # top of this loop. It was previously written as
            # "since_comment_id", so the Message-ID checkpoint never survived
            # between lookups.
            mailbox_stat["since_message_id"] = last_index

        if update_state and self.store is not None:
            self.store.update_source_state(workflow_id=identifier, state=state)

        return source_responses

    @staticmethod
    def _email_cleanup(content: str):  # type: ignore[no-untyped-def]
        # TODO: Implement the method to cleanup email contents
        pass

    @staticmethod
    def _parse_email_header(header: Message, key: str) -> str:
        """Decode a single email header value to str.

        NOTE(review): assumes the header exists — ``header[key]`` is None for
        a missing header and decode_header(None) would raise; also returns ""
        for raw bytes without a declared encoding. Confirm callers tolerate
        both.
        """
        value, encoding = decode_header(header[key])[0]
        if isinstance(value, bytes):
            # if it's a bytes, decode to str
            return "" if not encoding else value.decode(encoding)
        return str(value)
diff --git a/obsei_module/obsei-master/obsei/source/facebook_source.py b/obsei_module/obsei-master/obsei/source/facebook_source.py
new file mode 100644
index 0000000000000000000000000000000000000000..84416f95a321358369bb142fd45e0963180c20e3
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/source/facebook_source.py
@@ -0,0 +1,177 @@
+import logging
+from datetime import datetime
+from typing import Any, Dict, List, Optional
+
+from pydantic import Field, PrivateAttr
+from pydantic.types import SecretStr
+from pydantic_settings import BaseSettings
+from pyfacebook import FacebookApi
+
+from obsei.misc.utils import (
+ DATETIME_STRING_PATTERN,
+ DEFAULT_LOOKUP_PERIOD,
+ convert_utc_time,
+ obj_to_json,
+ convert_datetime_str_to_epoch,
+)
+from obsei.payload import TextPayload
+from obsei.source.base_source import BaseSource, BaseSourceConfig
+
+logger = logging.getLogger(__name__)
+
+
class FacebookCredentials(BaseSettings):
    """Facebook app credentials, optionally sourced from the environment.

    Either a long-lived user token, or an app id + secret pair (for
    application-only auth), must be provided (enforced by FacebookSourceConfig).

    NOTE(review): ``env=`` inside ``Field()`` is pydantic-v1 style; under
    pydantic_settings v2 it is treated as inert extra metadata — confirm the
    environment-variable mapping works with the pinned pydantic version.
    """

    app_id: Optional[SecretStr] = Field(None, env="facebook_app_id")
    app_secret: Optional[SecretStr] = Field(None, env="facebook_app_secret")
    long_term_token: Optional[SecretStr] = Field(None, env="facebook_long_term_token")
+
+
class FacebookSourceConfig(BaseSourceConfig):
    """Configuration for the Facebook source.

    Builds a ``pyfacebook.FacebookApi`` client at init time: a long-lived user
    token takes precedence; otherwise app id + secret are used in
    application-only mode.
    """

    # PrivateAttr keeps the client object out of serialized API responses
    _api_client: FacebookApi = PrivateAttr()
    TYPE: str = "Facebook"
    # Page to read posts from; `post_ids` (when set) restricts lookup to those posts
    page_id: str
    post_ids: Optional[List[str]] = None
    # Relative period ("1d", "12h") or absolute datetime string — see FacebookSource.lookup
    lookup_period: Optional[str] = None
    max_post: Optional[int] = 50
    cred_info: Optional[FacebookCredentials] = Field(None)

    def __init__(self, **data: Any):
        super().__init__(**data)

        # Fall back to environment-backed credentials when none are supplied
        self.cred_info = self.cred_info or FacebookCredentials()

        # Long-lived token implies user auth; otherwise require app id + secret
        if self.cred_info.long_term_token is not None:
            application_only_auth = False
        elif self.cred_info.app_id is not None and self.cred_info.app_secret is not None:
            application_only_auth = True
        else:
            raise AttributeError("`app_id`, `app_secret` and `long_term_token` required to connect to Facebook")

        self._api_client = FacebookApi(
            app_id=self.cred_info.app_id.get_secret_value() if self.cred_info.app_id else None,
            app_secret=self.cred_info.app_secret.get_secret_value() if self.cred_info.app_secret else None,
            access_token=self.cred_info.long_term_token.get_secret_value() if self.cred_info.long_term_token else None,
            application_only_auth=application_only_auth,
        )

    def get_client(self) -> FacebookApi:
        """Return the FacebookApi client built at construction time."""
        return self._api_client
+
+
class FacebookSource(BaseSource):
    """Source that fetches recent posts of a Facebook page and emits their comments."""

    NAME: str = "Facebook"

    def lookup(self, config: FacebookSourceConfig, **kwargs: Any) -> List[TextPayload]:  # type: ignore[override]
        """Fetch comments newer than the stored checkpoint.

        :param config: page/post selection and API credentials.
        :param kwargs: optional ``id`` workflow identifier used to load/save state.
        :return: one TextPayload per fetched comment.
        """
        source_responses: List[TextPayload] = []

        # Restore per-workflow state when a state store is configured
        identifier: str = kwargs.get("id", None)
        state: Optional[Dict[str, Any]] = (
            None
            if identifier is None or self.store is None
            else self.store.get_source_state(identifier)
        )
        update_state: bool = True if identifier else False
        state = state or dict()
        since_timestamp: Optional[int] = state.get("since_timestamp", None)
        if since_timestamp is None:
            lookup_period = config.lookup_period or DEFAULT_LOOKUP_PERIOD
            # Short values ("1d", "12h", ...) are relative periods; longer
            # values are absolute datetimes in DATETIME_STRING_PATTERN format
            if len(lookup_period) <= 5:
                since_time = convert_utc_time(lookup_period)
            else:
                since_time = datetime.strptime(lookup_period, DATETIME_STRING_PATTERN)

            since_timestamp = int(since_time.timestamp())
        self.log_object("Since: ", str(datetime.fromtimestamp(since_timestamp)))
        post_last_since_time = since_timestamp

        api = config.get_client()
        post_ids = config.post_ids
        if not post_ids:
            # No explicit post list configured: discover recent page posts
            posts = api.page.get_posts(
                page_id=config.page_id,
                count=config.max_post,
                since_time=str(since_timestamp),
                return_json=True,
            )
            self.log_object("Posts: ", str(posts))
            post_ids = []
            for post in posts:
                post_update_time = convert_datetime_str_to_epoch(post["updated_time"])
                if post_update_time is not None:
                    # Posts arrive newest-first; stop at the first stale one
                    if post_update_time < since_timestamp:
                        break

                    if (
                        post_last_since_time is None
                        or post_last_since_time < post_update_time
                    ):
                        post_last_since_time = post_update_time
                else:
                    # stdlib logging uses %-style placeholders; the original
                    # "{}" placeholder was emitted verbatim instead of the value
                    logger.warning("Unable to parse post update time: %s", post["updated_time"])

                post_ids.append(post["id"])

        for post_id in post_ids:
            # Collect (and later persist) the per-post checkpoint
            post_stat: Dict[str, Any] = state.get(post_id, dict())
            state[post_id] = post_stat

            # Read the per-post checkpoint; the original read the page-level
            # `state`, ignoring the per-post value written below
            comment_since_time = post_stat.get("since_timestamp", since_timestamp)
            comment_last_since_time = comment_since_time

            comments, comment_summary = api.page.get_comments(
                object_id=post_id,
                filter_type="stream",
                order_type="reverse_chronological",
            )
            self.log_object("Comments: ", str(comments))
            self.log_object("Comment Summary: ", str(comment_summary))

            for comment in comments:
                comment_created_time = convert_datetime_str_to_epoch(
                    comment.created_time
                )
                if comment_created_time is None:
                    # The original compared None < int and crashed here
                    logger.warning("Unable to parse comment creation time: %s", comment.created_time)
                    continue

                # Comments are reverse-chronological; the rest are stale
                if comment_created_time < comment_since_time:
                    break

                if (
                    comment_last_since_time is None
                    or comment_last_since_time < comment_created_time
                ):
                    comment_last_since_time = comment_created_time

                source_responses.append(
                    TextPayload(
                        processed_text=comment.message,
                        meta=vars(comment),
                        source_name=self.NAME,
                    )
                )

            post_stat["since_timestamp"] = comment_last_since_time

        state["since_timestamp"] = post_last_since_time

        # TODO: Augment payloads with post attachment title/description data

        if update_state and self.store is not None:
            self.store.update_source_state(workflow_id=identifier, state=state)

        return source_responses

    @staticmethod
    def log_object(message: str, result: Any) -> None:
        """Debug-log *message* followed by the JSON representation of *result*."""
        logger.debug(message + str(obj_to_json(result)))
diff --git a/obsei_module/obsei-master/obsei/source/google_maps_reviews.py b/obsei_module/obsei-master/obsei/source/google_maps_reviews.py
new file mode 100644
index 0000000000000000000000000000000000000000..9054e74105a27183eb0e447424b96d07fc188b70
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/source/google_maps_reviews.py
@@ -0,0 +1,124 @@
+import logging
+from datetime import datetime
+from typing import Optional, List, Any, Dict
+
+import requests
+from pydantic import SecretStr, Field
+
+from obsei.misc.utils import convert_utc_time, DATETIME_STRING_PATTERN
+from obsei.payload import TextPayload
+from obsei.source.base_source import BaseSourceConfig, BaseSource
+
+logger = logging.getLogger(__name__)
+OUTSCRAPPER_API_URL = 'https://api.app.outscraper.com'
+
+
class OSGoogleMapsReviewsConfig(BaseSourceConfig):
    """Configuration for fetching Google Maps reviews via the Outscraper API.

    ``queries`` are place queries; time bounds come from ``since_timestamp`` /
    ``until_timestamp`` or, failing that, ``lookup_period``.
    """

    NAME: str = "Maps Reviews Scrapper"
    queries: List[str]
    sort: str = "newest"
    ignore_empty_reviews: bool = True
    language: str = "en"
    since_timestamp: Optional[int] = None
    until_timestamp: Optional[int] = None
    lookup_period: Optional[str] = None
    number_of_reviews: int = 10
    number_of_places_per_query: int = 1
    country: Optional[str] = None
    # Response fields to request from the API (comma-joined at request time)
    filtered_fields: List[str] = Field(['reviews_data'])
    # parameter defines the coordinates of the location where you want your query to be applied.
    # It has to be constructed in the next sequence: "@" + "latitude" + "," + "longitude" + "," + "zoom"
    # (e.g. "@41.3954381,2.1628662,15.1z").
    central_coordinates: Optional[str] = None
    # Get API key from https://outscraper.com/
    api_key: Optional[SecretStr] = Field(None, env="outscrapper_api_key")

    def __init__(self, **values: Any):
        super().__init__(**values)

        if self.api_key is None:
            # Fixed the original's ungrammatical message ("key require to fetch")
            raise ValueError("Outscraper API key is required to fetch reviews data")
+
+
class OSGoogleMapsReviewsSource(BaseSource):
    """Source that pulls Google Maps reviews through the Outscraper REST API."""

    NAME: str = "Maps Reviews Scrapper"

    def lookup(self, config: OSGoogleMapsReviewsConfig, **kwargs: Any) -> List[TextPayload]:  # type: ignore[override]
        """Fetch reviews newer than the stored checkpoint for each query.

        :param config: queries, limits and API key.
        :param kwargs: optional ``id`` workflow identifier used to load/save state.
        :return: one TextPayload per review.
        """
        source_responses: List[TextPayload] = []

        # Restore per-workflow state when a state store is configured.
        # BUG FIX: the original tested the *builtin* `id` (always truthy)
        # instead of `identifier`, so a missing workflow id still hit the store.
        identifier: str = kwargs.get("id", None)
        state: Optional[Dict[str, Any]] = (
            None
            if identifier is None or self.store is None
            else self.store.get_source_state(identifier)
        )

        update_state: bool = True if identifier else False
        state = state or dict()

        since_timestamp: Optional[int] = state.get("since_timestamp", None)
        since_timestamp = since_timestamp or config.since_timestamp
        if since_timestamp is None and config.lookup_period is not None:
            # Short values are relative periods; longer values absolute datetimes
            if len(config.lookup_period) <= 5:
                since_time = convert_utc_time(config.lookup_period)
            else:
                since_time = datetime.strptime(config.lookup_period, DATETIME_STRING_PATTERN)

            since_timestamp = int(since_time.timestamp())

        last_reviews_since_time = since_timestamp

        params: Dict[str, Any] = {
            'query': config.queries,
            'reviewsLimit': config.number_of_reviews,
            'limit': config.number_of_places_per_query,
            'sort': config.sort,
            # Reviews are sorted from latest to oldest in case cutoff or start is passed
            # cutoff is oldest timestamp till reviews are needed
            'cutoff': since_timestamp,
            # start is newest timestamp from reviews are needed
            'start': config.until_timestamp,
            'ignoreEmpty': config.ignore_empty_reviews,
            'coordinates': config.central_coordinates,
            'language': config.language,
            'region': config.country,
            'fields': ",".join(config.filtered_fields),
            'async': False,
        }

        # For API doc refer https://app.outscraper.com/api-docs#tag/Google-Reviews
        response = requests.get(
            f'{OUTSCRAPPER_API_URL}/maps/reviews-v3',
            params=params,
            headers={
                'X-API-KEY': "" if config.api_key is None else config.api_key.get_secret_value(),
            },
            timeout=60,  # avoid hanging forever on a stalled connection
        )

        queries_data = []
        if response.status_code == 200:
            queries_data = response.json().get('data', [])
        else:
            logger.warning(f"API call failed with error: {response.json()}")

        for query_data in queries_data:
            reviews = [] if "reviews_data" not in query_data else query_data.pop("reviews_data")

            for review in reviews:
                source_responses.append(
                    TextPayload(
                        processed_text=review["review_text"],
                        meta={**review, **query_data},
                        source_name=self.NAME,
                    )
                )
                review_time = review["review_timestamp"]

                # Track the newest review seen for the next checkpoint
                if last_reviews_since_time is None or last_reviews_since_time < review_time:
                    last_reviews_since_time = review_time

        state["since_timestamp"] = last_reviews_since_time
        if update_state and self.store is not None:
            self.store.update_source_state(workflow_id=identifier, state=state)

        return source_responses
diff --git a/obsei_module/obsei-master/obsei/source/google_news_source.py b/obsei_module/obsei-master/obsei/source/google_news_source.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1fcf985fdc4b3704f69269274fcd9b319390394
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/source/google_news_source.py
@@ -0,0 +1,160 @@
+from typing import Any, Dict, List, Optional
+from urllib import parse
+
+import dateparser
+from GoogleNews import GoogleNews
+from pydantic import PrivateAttr
+from datetime import datetime, date, timedelta, time, timezone
+
+from obsei.payload import TextPayload
+from obsei.misc.utils import DATETIME_STRING_PATTERN, convert_utc_time, DEFAULT_LOOKUP_PERIOD
+from obsei.source.base_source import BaseSource, BaseSourceConfig
+from obsei.source.website_crawler_source import (
+ BaseCrawlerConfig,
+ TrafilaturaCrawlerConfig,
+)
+
+GOOGLE_DATE_TIME_QUERY_PATTERN = "%Y-%m-%d"
+
+
class GoogleNewsConfig(BaseSourceConfig):
    """Configuration for fetching articles from Google News.

    The fetch window is [``after_date``, ``before_date``]; ``lookup_period``
    is a relative alternative to ``after_date`` (mutually exclusive).
    """

    # PrivateAttr keeps the client object out of serialized API responses
    _google_news_client: GoogleNews = PrivateAttr()
    TYPE: str = "GoogleNews"
    query: str
    country: Optional[str] = "US"
    language: Optional[str] = "en"
    max_results: Optional[int] = 100
    lookup_period: Optional[str] = None
    # Window start (oldest articles) — the original comment said "latest", but
    # GoogleNewsSource uses after_date as the start time of the window
    after_date: Optional[str] = None
    # Window end (newest articles)
    before_date: Optional[str] = None
    fetch_article: Optional[bool] = False
    crawler_config: Optional[BaseCrawlerConfig] = None

    def __init__(self, **data: Any):
        super().__init__(**data)

        if self.lookup_period and self.after_date:
            raise AttributeError("Can't use `lookup_period` and `after_date` both")
        elif self.before_date and not self.after_date and not self.lookup_period:
            # BUG FIX: the original raised here even when `lookup_period` was
            # supplied, contradicting its own error message
            raise AttributeError("Can't use `before_date` without `after_date` or `lookup_period`")

        if self.lookup_period:
            after_time = convert_utc_time(self.lookup_period)
            self.after_date = after_time.strftime(GOOGLE_DATE_TIME_QUERY_PATTERN)

        if not self.before_date:
            # Default the window end to tomorrow (UTC midnight + 1 day)
            before_time = datetime.combine(date.today(), time(tzinfo=timezone.utc)) + timedelta(days=1)
            self.before_date = before_time.strftime(GOOGLE_DATE_TIME_QUERY_PATTERN)

        self._google_news_client = GoogleNews(
            lang=self.language,
            region=self.country
        )

        if not self.crawler_config:
            self.crawler_config = TrafilaturaCrawlerConfig(urls=[])

    def get_client(self) -> GoogleNews:
        """Return the configured GoogleNews client."""
        return self._google_news_client
+
+
class GoogleNewsSource(BaseSource):
    """Source that walks Google News day-by-day backwards over a date window."""

    NAME: Optional[str] = "GoogleNews"

    def lookup(self, config: GoogleNewsConfig, **kwargs: Any) -> List[TextPayload]:  # type: ignore[override]
        """Fetch articles inside the configured window, newest day first.

        :param config: query, window and optional crawler settings.
        :param kwargs: optional ``id`` workflow identifier used to load/save state.
        :return: one TextPayload per article (title plus optional crawled text).
        """
        source_responses: List[TextPayload] = []

        # Restore per-workflow state when a state store is configured
        id: str = kwargs.get("id", None)
        state: Optional[Dict[str, Any]] = (
            None
            if id is None or self.store is None
            else self.store.get_source_state(id)
        )
        update_state: bool = True if id else False
        state = state or dict()
        lookup_period: str = state.get("since_time", None) or DEFAULT_LOOKUP_PERIOD
        since_time: datetime = convert_utc_time(lookup_period)
        last_since_time = since_time

        today_start_of_day: datetime = datetime.combine(date.today(), time(tzinfo=timezone.utc))
        today_end_of_day: datetime = today_start_of_day + timedelta(days=1)

        # Window start: explicit after_date, else today (UTC midnight)
        last_after_time: datetime
        if config.after_date:
            last_after_time = convert_utc_time(config.after_date)
        else:
            last_after_time = today_start_of_day

        if state.get("since_time", None) is not None:
            # Resume from the later of checkpoint and configured start.
            # BUG FIX: the original conditional yielded `since_time` in both
            # branches (its else returned `last_since_time`, which equals
            # `since_time`), wrongly rewinding the window start.
            last_after_time = max(last_after_time, since_time)

        # Window end: explicit before_date (only honoured with after_date), else tomorrow
        before_time: datetime
        if config.before_date and config.after_date:
            before_time = convert_utc_time(config.before_date)
        else:
            before_time = today_end_of_day

        if before_time > today_start_of_day:
            before_time = today_end_of_day

        google_news_client = config.get_client()
        more_data_exist = True
        while more_data_exist and before_time > last_after_time:
            # Query one day at a time, stepping backwards from the window end
            after_time = before_time - timedelta(days=1)
            after_date = after_time.strftime(GOOGLE_DATE_TIME_QUERY_PATTERN)
            before_date = before_time.strftime(GOOGLE_DATE_TIME_QUERY_PATTERN)

            new_query = f'{config.query}+after:{after_date}+before:{before_date}'

            before_time = after_time

            google_news_client.get_news(new_query)
            articles = google_news_client.results(sort=True)

            for article in articles:
                published_date = (
                    None
                    if article["datetime"] is None
                    else article["datetime"].replace(tzinfo=timezone.utc)
                )

                article_text: str = ""
                if config.fetch_article and config.crawler_config:
                    # Optionally crawl the article page for its full text
                    extracted_data = config.crawler_config.extract_url(url=article["link"])

                    if extracted_data.get("text", None) is not None:
                        article_text = extracted_data["text"]
                        del extracted_data["text"]

                    article["extracted_data"] = extracted_data

                source_responses.append(
                    TextPayload(
                        processed_text=f"{article['title']}.\n\n {article_text}",
                        meta=vars(article) if hasattr(article, "__dict__") else article,
                        source_name=self.NAME,
                    )
                )

                if config.max_results is not None and len(source_responses) >= config.max_results:
                    source_responses = source_responses[:config.max_results]
                    more_data_exist = False
                    break

                if published_date and since_time and published_date < since_time:
                    # Older than the checkpoint: stop paging further back
                    more_data_exist = False
                    break
                if last_since_time is None or (
                    published_date and last_since_time < published_date
                ):
                    last_since_time = published_date

        if update_state and last_since_time and self.store is not None:
            state["since_time"] = last_since_time.strftime(DATETIME_STRING_PATTERN)
            self.store.update_source_state(workflow_id=id, state=state)

        return source_responses
diff --git a/obsei_module/obsei-master/obsei/source/pandas_source.py b/obsei_module/obsei-master/obsei/source/pandas_source.py
new file mode 100644
index 0000000000000000000000000000000000000000..537ecc25ba4b680d0e4146b85ff2ba937d33e833
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/source/pandas_source.py
@@ -0,0 +1,54 @@
+from typing import List, Optional, Any
+
+from pandas import DataFrame
+
+from obsei.source.base_source import BaseSource, BaseSourceConfig
+from obsei.payload import TextPayload
+
+
class PandasSourceConfig(BaseSourceConfig):
    """Configuration for reading text rows from a pandas DataFrame.

    ``text_columns`` are joined with ``separator`` to form the payload text;
    ``include_columns`` restricts the meta dict (None keeps the whole row).
    """

    TYPE: str = "Pandas"

    dataframe: DataFrame
    text_columns: List[str]
    separator: str = " "
    include_columns: Optional[List[str]] = None

    def __init__(self, **data: Any):
        super().__init__(**data)

        if len(self.text_columns) == 0:
            raise ValueError("`text_columns` cannot be empty")

        # all() accepts a generator directly; no need to build a list first
        if not all(
            text_column in self.dataframe.columns for text_column in self.text_columns
        ):
            raise ValueError("Every `text_columns` should be present in `dataframe`")

        try:
            self.dataframe[self.text_columns] = self.dataframe[
                self.text_columns
            ].astype("string")
        except TypeError as e:
            # Chain the original error so the root cause stays visible (B904)
            raise ValueError("Unable to convert `text_columns` to string dtype") from e
+
+
class PandasSource(BaseSource):
    """Source that turns each row of the configured DataFrame into a TextPayload."""

    NAME: Optional[str] = "Pandas"

    def lookup(self, config: PandasSourceConfig, **kwargs: Any) -> List[TextPayload]:  # type: ignore[override]
        """Convert every DataFrame row into one payload.

        Text columns are concatenated with the configured separator; the meta
        dict is either the full row or only the `include_columns` subset.
        """
        payloads: List[TextPayload] = []
        for row in config.dataframe.to_dict("records"):
            text = config.separator.join(
                row.get(column) for column in config.text_columns
            )
            if config.include_columns is None:
                meta = row
            else:
                meta = {name: row[name] for name in config.include_columns}
            payloads.append(
                TextPayload(processed_text=text, meta=meta, source_name=self.NAME)
            )

        return payloads
diff --git a/obsei_module/obsei-master/obsei/source/playstore_reviews.py b/obsei_module/obsei-master/obsei/source/playstore_reviews.py
new file mode 100644
index 0000000000000000000000000000000000000000..99ab15497f2837f62aa6e03f96903b393ff13aa6
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/source/playstore_reviews.py
@@ -0,0 +1,128 @@
+from typing import Any, Dict, List, Optional
+
+from google.auth.credentials import Credentials
+from google.oauth2 import service_account
+from googleapiclient.discovery import build
+from pydantic import Field, SecretStr, PrivateAttr
+from pydantic_settings import BaseSettings
+
+from obsei.payload import TextPayload
+from obsei.source.base_source import BaseSource, BaseSourceConfig
+
+
class GoogleCredInfo(BaseSettings):
    """Google service-account credential settings for the Play Store source.

    Currently only service_account_file type credential supported.
    Refer: https://developers.google.com/identity/protocols/oauth2/service-account
    """

    # Path to the service-account JSON key file
    service_cred_file: Optional[str] = Field(None, env="google_service_cred_file")
    # Developer (API) key passed to the Google API client
    developer_key: Optional[SecretStr] = Field(None, env="google_developer_key")
    # OAuth scopes requested for the androidpublisher API
    scopes: List[str] = ["https://www.googleapis.com/auth/androidpublisher"]
+
+
class PlayStoreConfig(BaseSourceConfig):
    """Configuration for the official Play Store reviews source.

    Builds Google service-account credentials at init; optionally narrows them
    to a quota project and/or an impersonated subject.
    """

    # PrivateAttr keeps credentials out of serialized API responses
    _credentials: Credentials = PrivateAttr()
    TYPE: str = "PlayStore"
    # Android application package to fetch reviews for
    package_name: str
    # Pagination controls for the reviews.list API
    start_index: Optional[int] = None
    max_results: int = 10
    num_retries: int = 1
    # Optional credential refinements (quota project / domain-wide delegation)
    with_quota_project_id: Optional[str] = None
    with_subject: Optional[str] = None
    cred_info: Optional[GoogleCredInfo] = Field(None)

    def __init__(self, **values: Any):
        super().__init__(**values)

        # Fall back to environment-backed credentials when none are supplied
        self.cred_info = self.cred_info or GoogleCredInfo()

        if self.cred_info.service_cred_file is None or self.cred_info.developer_key is None:
            raise ValueError("`service_cred_file` and `developer_key` can't be empty")

        self._credentials = service_account.Credentials.from_service_account_file(
            filename=self.cred_info.service_cred_file, scopes=self.cred_info.scopes
        )

        if self.with_quota_project_id is not None:
            self._credentials = self._credentials.with_quota_project(self.with_quota_project_id)

        if self.with_subject is not None:
            self._credentials = self._credentials.with_subject(self.with_subject)

    def get_google_credentials(self) -> Credentials:
        """Return the (possibly refined) service-account credentials."""
        return self._credentials

    def get_developer_key(self) -> str:
        """Return the plain developer key.

        :raises ValueError: when credentials/key were never configured.
        """
        if self.cred_info is None or self.cred_info.developer_key is None:
            raise ValueError("`developer_key` can't be empty")
        return self.cred_info.developer_key.get_secret_value()
+
+
class PlayStoreSource(BaseSource):
    """Source that fetches app reviews via the official Google Play Developer API."""

    NAME: str = "PlayStore"

    def lookup(self, config: PlayStoreConfig, **kwargs: Any) -> List[TextPayload]:  # type: ignore[override]
        """Fetch all available reviews for the configured package, paging through results.

        :param config: package, credentials and pagination settings.
        :param kwargs: optional ``id`` workflow identifier used to load/save state.
        :return: one TextPayload per review comment.
        """
        source_responses: List[TextPayload] = []
        # Refer https://github.com/googleapis/google-api-python-client/blob/master/docs/start.md
        with build(
            serviceName="androidpublisher",
            version="v3",
            credentials=config.get_google_credentials(),
            developerKey=config.get_developer_key(),
        ) as service:
            # Use a dedicated name for the API resource: the original rebound
            # `reviews` to the response payload, clobbering the resource needed
            # for the next pagination call
            reviews_resource = service.reviews()
            pagination_token: Optional[str] = None

            # Restore per-workflow state when a state store is configured
            id: str = kwargs.get("id", None)
            state: Optional[Dict[str, Any]] = (
                None
                if id is None or self.store is None
                else self.store.get_source_state(id)
            )
            start_index: Optional[int] = (
                config.start_index or None
                if state is None
                else state.get("start_index", None)
            )
            update_state: bool = True if id else False
            state = state or dict()
            review_id = start_index

            while True:
                # https://googleapis.github.io/google-api-python-client/docs/dyn/androidpublisher_v3.reviews.html#list
                # NOTE(review): the discovery client expects camelCase params
                # (packageName/maxResults/startIndex) and the request must be
                # executed; the original passed snake_case kwargs and iterated
                # the un-executed HttpRequest object — confirm against the
                # androidpublisher v3 reference
                responses = reviews_resource.list(
                    packageName=config.package_name,
                    maxResults=config.max_results,
                    startIndex=start_index,
                    token=pagination_token,
                ).execute(num_retries=config.num_retries)

                # BUG FIX: the original read responses["responses"]; the review
                # list lives under the "reviews" key
                for review in responses.get("reviews", []):
                    if "comments" not in review:
                        continue

                    review_id = review["reviewId"]

                    # Currently only one user comment is supported
                    text = review["comments"][0]["userComment"]["text"]
                    source_responses.append(
                        TextPayload(
                            processed_text=text, meta=review, source_name=self.NAME
                        )
                    )

                pagination_token = responses.get("tokenPagination", {}).get(
                    "nextPageToken"
                )

                if pagination_token is None:
                    break

            if update_state and self.store is not None:
                state["start_index"] = review_id
                self.store.update_source_state(workflow_id=id, state=state)

        return source_responses
diff --git a/obsei_module/obsei-master/obsei/source/playstore_scrapper.py b/obsei_module/obsei-master/obsei/source/playstore_scrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7e3472e18c38ba8380c766bc46774effd8f97bc
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/source/playstore_scrapper.py
@@ -0,0 +1,168 @@
+import logging
+import re
+from datetime import datetime, timezone
+from typing import Any, Dict, List, Optional, Tuple
+from urllib import parse
+
+from google_play_scraper import Sort, reviews
+
+from obsei.misc.web_search import perform_search
+from obsei.source.base_source import BaseSource, BaseSourceConfig
+from obsei.payload import TextPayload
+from obsei.misc.utils import (
+ DATETIME_STRING_PATTERN,
+ DEFAULT_LOOKUP_PERIOD,
+ convert_utc_time,
+)
+
+logger = logging.getLogger(__name__)
+
+
class PlayStoreScrapperConfig(BaseSourceConfig):
    """Configuration for scraping Play Store reviews (google_play_scraper based).

    The package is resolved in priority order: explicit ``app_url`` >
    ``package_name`` > web search by ``app_name``.
    """

    TYPE: str = "PlayStoreScrapper"
    app_url: Optional[str] = None
    countries: Optional[List[str]] = None
    package_name: Optional[str] = None
    app_name: Optional[str] = None
    language: Optional[str] = None
    filter_score_with: Optional[int] = None
    lookup_period: Optional[str] = None
    max_count: Optional[int] = 200

    def __init__(self, **data: Any):
        super().__init__(**data)

        if self.app_url is not None:
            self.package_name, self.countries, self.language = PlayStoreScrapperConfig.parse_app_url(self.app_url)
        else:
            if not self.package_name and self.app_name:
                self.package_name = PlayStoreScrapperConfig.search_package_name(
                    self.app_name
                )

        if not self.package_name:
            raise ValueError("Valid `package_name`, `app_name` or `app_url` is mandatory")

        # Defaults applied after resolution
        self.language = self.language or "en"
        self.countries = self.countries or ["us"]
        self.app_name = self.app_name or self.package_name

    @classmethod
    def parse_app_url(cls, app_url: str) -> Tuple[Optional[str], Optional[List[str]], Optional[str]]:
        """Extract (package_name, countries, language) from a Play Store URL.

        Reads the ``id``, ``gl`` and ``hl`` query parameters; missing values
        come back as None.
        """
        parsed_url = parse.urlparse(app_url)
        query_dict = parse.parse_qs(parsed_url.query)
        countries = query_dict.get('gl', None)

        languages = query_dict.get('hl', None)
        language = languages[0] if languages else None

        package_ids = query_dict.get('id', None)
        package_name = package_ids[0] if package_ids else None

        return package_name, countries, language

    @classmethod
    def search_package_name(cls, app_name: str) -> str:
        """Find an app's package id by web-searching for its Play Store page.

        :raises RuntimeError: when no Play Store details URL appears in the results.
        """
        base_request_url = "https://play.google.com"  # plain string; was a pointless f-string
        search_response = perform_search(
            request_url=base_request_url, query=f"play store {app_name}"
        )

        # BUG FIX: escape the dots and accept the full package-id alphabet
        # (letters of either case, digits, underscores, dots); the original
        # lowercase-only pattern truncated mixed-case package ids
        pattern = r"play\.google\.com/store/apps/details.+?id=([\w.]+)"
        match_object = re.search(pattern, search_response.text)
        if match_object:
            return str(match_object.group(1))
        raise RuntimeError("Pattern matching is not found")
+
+
class PlayStoreScrapperSource(BaseSource):
    """Source that scrapes Play Store reviews per configured country."""

    NAME: Optional[str] = "PlayStoreScrapper"

    def lookup(  # type: ignore[override]
        self, config: PlayStoreScrapperConfig, **kwargs: Any
    ) -> List[TextPayload]:
        """Fetch reviews newer than each country's stored checkpoint.

        :param config: package, countries and scraping parameters.
        :param kwargs: optional ``id`` workflow identifier used to load/save state.
        :return: one TextPayload per review.
        """
        source_responses: List[TextPayload] = []

        # Restore per-workflow state when a state store is configured
        id: str = kwargs.get("id", None)
        state: Optional[Dict[str, Any]] = (
            None
            if id is None or self.store is None
            else self.store.get_source_state(id)
        )
        update_state: bool = True if id else False
        state = state or dict()

        if config.countries is None or len(config.countries) == 0:
            logger.warning("`countries` in config should not be empty or None")
            return source_responses

        for country in config.countries:
            country_stat: Dict[str, Any] = state.get(country, dict())
            # BUG FIX: attach the per-country checkpoint to the state dict —
            # the original left this commented out, so `since_time` written
            # below was never persisted
            state[country] = country_stat

            lookup_period: str = country_stat.get("since_time", config.lookup_period)
            lookup_period = lookup_period or DEFAULT_LOOKUP_PERIOD
            # Short values are relative periods; longer values absolute datetimes
            if len(lookup_period) <= 5:
                since_time = convert_utc_time(lookup_period)
            else:
                since_time = datetime.strptime(lookup_period, DATETIME_STRING_PATTERN)

            last_since_time: datetime = since_time

            continuation_token = None
            stale_found = False
            while True:
                store_reviews, continuation_token = reviews(
                    app_id=config.package_name,
                    lang=config.language,
                    country=country,
                    sort=Sort.NEWEST,
                    filter_score_with=config.filter_score_with,
                    continuation_token=continuation_token,
                    count=config.max_count,
                )
                store_reviews = store_reviews or []

                for review in store_reviews:
                    review_time = review["at"].replace(tzinfo=timezone.utc)

                    # Reviews are newest-first: the first stale one ends the
                    # scan. BUG FIX: check BEFORE appending — the original
                    # emitted the stale review and then broke
                    if since_time > review_time:
                        stale_found = True
                        break

                    source_responses.append(
                        TextPayload(
                            processed_text=review["content"],
                            meta=review,
                            source_name=self.NAME,
                        )
                    )

                    if last_since_time is None or last_since_time < review_time:
                        last_since_time = review_time

                # Stop paging once stale data was reached (the original kept
                # fetching older pages), the token is exhausted, or enough
                # responses were collected
                if (
                    stale_found
                    or continuation_token is None
                    or continuation_token.token is None
                    or continuation_token.count <= len(source_responses)
                ):
                    break

            country_stat["since_time"] = last_since_time.strftime(
                DATETIME_STRING_PATTERN
            )

        if update_state and self.store is not None:
            self.store.update_source_state(workflow_id=id, state=state)

        return source_responses
diff --git a/obsei_module/obsei-master/obsei/source/reddit_scrapper.py b/obsei_module/obsei-master/obsei/source/reddit_scrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca79601f1ba86a00fcd3904bcb506eb2fb331e38
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/source/reddit_scrapper.py
@@ -0,0 +1,108 @@
+import logging
+from datetime import datetime, timezone
+from typing import Any, Dict, List, Optional
+
+import mmh3
+from pydantic import PrivateAttr
+from reddit_rss_reader.reader import RedditContent, RedditRSSReader
+
+from obsei.source.base_source import BaseSource, BaseSourceConfig
+from obsei.payload import TextPayload
+from obsei.misc.utils import (
+ DATETIME_STRING_PATTERN,
+ DEFAULT_LOOKUP_PERIOD,
+ convert_utc_time,
+)
+
+logger = logging.getLogger(__name__)
+
+
class RedditScrapperConfig(BaseSourceConfig):
    """Configuration for scraping a Reddit RSS feed.

    A stable ``url_id`` (32-bit murmur hash of the URL) identifies the feed in
    persisted state when the caller does not provide one.
    """

    # PrivateAttr keeps the reader client out of serialized API responses
    _scrapper: RedditRSSReader = PrivateAttr()
    TYPE: str = "RedditScrapper"
    url: str
    url_id: Optional[str] = None
    user_agent: Optional[str] = None
    lookup_period: Optional[str] = None

    def __init__(self, **data: Any):
        super().__init__(**data)

        # Derive a stable identifier from the URL when none was supplied
        if not self.url_id:
            self.url_id = "{:02x}".format(mmh3.hash(self.url, signed=False))

        agent = self.user_agent or "script {url_hash}".format(url_hash=self.url_id)
        self._scrapper = RedditRSSReader(url=self.url, user_agent=agent)

    def get_readers(self) -> RedditRSSReader:
        """Return the underlying RSS reader client."""
        return self._scrapper
+
+
class RedditScrapperSource(BaseSource):
    """Source that reads Reddit entries from an RSS feed via RedditRSSReader."""

    NAME: Optional[str] = "RedditScrapper"

    def lookup(self, config: RedditScrapperConfig, **kwargs: Any) -> List[TextPayload]:  # type: ignore[override]
        """Fetch feed entries newer than the stored checkpoint.

        :param config: feed URL and reader settings.
        :param kwargs: optional ``id`` workflow identifier used to load/save state.
        :return: one TextPayload per feed entry (title + extracted text).
        """
        source_responses: List[TextPayload] = []

        # Restore per-workflow state when a state store is configured
        identifier: str = kwargs.get("id", None)
        state: Optional[Dict[str, Any]] = (
            None
            if identifier is None or self.store is None
            else self.store.get_source_state(identifier)
        )
        update_state: bool = True if identifier else False
        state = state or dict()

        # Per-feed checkpoint keyed by the URL hash
        scrapper_stat: Dict[str, Any] = (
            dict() if not config.url_id else state.get(config.url_id, dict())
        )
        lookup_period: str = scrapper_stat.get("since_time", config.lookup_period)
        lookup_period = lookup_period or DEFAULT_LOOKUP_PERIOD
        since_time = convert_utc_time(lookup_period)

        last_since_time: datetime = since_time

        since_id: Optional[str] = scrapper_stat.get("since_id", None)
        last_index = since_id
        if config.url_id:
            state[config.url_id] = scrapper_stat

        reddit_data: Optional[List[RedditContent]] = None
        try:
            reddit_data = config.get_readers().fetch_content(
                after=since_time, since_id=since_id
            )
        except RuntimeError as ex:
            # BUG FIX: log the exception itself; the original logged only
            # `ex.__cause__`, which is None unless the error was explicitly chained
            logger.warning("Unable to fetch Reddit content: %s", ex)

        reddit_data = reddit_data or []

        for reddit in reddit_data:
            source_responses.append(
                TextPayload(
                    processed_text=f"{reddit.title}. {reddit.extracted_text}",
                    meta=reddit.__dict__,
                    source_name=self.NAME,
                )
            )

            comment_time = reddit.updated.replace(tzinfo=timezone.utc)

            if last_since_time is None or last_since_time < comment_time:
                last_since_time = comment_time
            if last_index is None:
                # Entries are assumed newest-first, so the first id is the newest
                last_index = reddit.id

        scrapper_stat["since_time"] = last_since_time.strftime(DATETIME_STRING_PATTERN)
        scrapper_stat["since_id"] = last_index

        if update_state and self.store is not None:
            self.store.update_source_state(workflow_id=identifier, state=state)

        return source_responses
diff --git a/obsei_module/obsei-master/obsei/source/reddit_source.py b/obsei_module/obsei-master/obsei/source/reddit_source.py
new file mode 100644
index 0000000000000000000000000000000000000000..67a0603c4ab9bdfd20e95472f296d5a7d0b51ea7
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/source/reddit_source.py
@@ -0,0 +1,150 @@
+from datetime import datetime
+from typing import Any, Dict, List, Optional
+
+from praw import Reddit
+from pydantic import Field, PrivateAttr, SecretStr
+from pydantic_settings import BaseSettings
+
+from obsei.payload import TextPayload
+from obsei.misc.utils import (
+ DATETIME_STRING_PATTERN,
+ DEFAULT_LOOKUP_PERIOD,
+ convert_utc_time,
+ text_from_html,
+)
+from obsei.source.base_source import BaseSource, BaseSourceConfig
+
+
class RedditCredInfo(BaseSettings):
    """Reddit API credentials; unset fields fall back to environment variables."""

    # Create credential at https://www.reddit.com/prefs/apps
    # Also refer https://praw.readthedocs.io/en/latest/getting_started/authentication.html
    # Currently Password Flow, Read Only Mode and Saved Refresh Token Mode are supported
    # NOTE(review): client_id/client_secret default to None although annotated as
    # non-Optional SecretStr — confirm pydantic-settings accepts this at runtime.
    client_id: SecretStr = Field(None, env="reddit_client_id")
    client_secret: SecretStr = Field(None, env="reddit_client_secret")
    user_agent: str = "Test User Agent"
    redirect_uri: Optional[str] = None
    refresh_token: Optional[SecretStr] = Field(None, env="reddit_refresh_token")
    username: Optional[str] = Field(None, env="reddit_username")
    password: Optional[SecretStr] = Field(None, env="reddit_password")
    # When True the praw client is put into read-only mode (no write actions).
    read_only: bool = True
+
+
class RedditConfig(BaseSourceConfig):
    """Configuration for the Reddit source; constructs a shared praw client on init."""

    # This is done to avoid exposing member to API response
    _reddit_client: Reddit = PrivateAttr()
    TYPE: str = "Reddit"
    # Subreddit names; joined with '+' for a praw multi-subreddit query.
    subreddits: List[str]
    # Optional whitelist of submission ids; other posts are skipped.
    post_ids: Optional[List[str]] = None
    lookup_period: Optional[str] = None
    include_post_meta: Optional[bool] = True
    post_meta_field: str = "post_meta"
    cred_info: Optional[RedditCredInfo] = Field(None)

    def __init__(self, **data: Any):
        """Validate the model, then build the praw Reddit client from cred_info."""
        super().__init__(**data)

        # Fall back to environment-provided credentials when none were passed.
        self.cred_info = self.cred_info or RedditCredInfo()

        # NOTE(review): client_id/client_secret may be None when no credentials
        # are configured; get_secret_value() would then raise — verify upstream.
        self._reddit_client = Reddit(
            client_id=self.cred_info.client_id.get_secret_value(),
            client_secret=self.cred_info.client_secret.get_secret_value(),
            redirect_uri=self.cred_info.redirect_uri,
            user_agent=self.cred_info.user_agent,
            refresh_token=self.cred_info.refresh_token.get_secret_value()
            if self.cred_info.refresh_token
            else None,
            username=self.cred_info.username if self.cred_info.username else None,
            password=self.cred_info.password.get_secret_value()
            if self.cred_info.password
            else None,
        )

        # Honour read-only mode as configured.
        self._reddit_client.read_only = self.cred_info.read_only

    def get_reddit_client(self) -> Reddit:
        """Return the praw client built during initialisation."""
        return self._reddit_client
+
+
class RedditSource(BaseSource):
    """Source that fetches new top-level comments from the configured subreddits."""

    NAME: str = "Reddit"

    def lookup(self, config: RedditConfig, **kwargs: Any) -> List[TextPayload]:  # type: ignore[override]
        """Fetch comments newer than the stored checkpoint.

        When a workflow ``id`` is passed via kwargs and a store is attached,
        the per-post "since" time and comment id are read from and written
        back to the store, so repeated calls only return new comments.
        """
        source_responses: List[TextPayload] = []

        # Restore per-workflow state, if a store and workflow id are available.
        # Renamed local from `id` to avoid shadowing the builtin.
        identifier: Optional[str] = kwargs.get("id", None)
        state: Optional[Dict[str, Any]] = (
            None
            if identifier is None or self.store is None
            else self.store.get_source_state(identifier)
        )
        update_state: bool = bool(identifier)
        state = state or dict()

        subreddit_reference = config.get_reddit_client().subreddit(
            "+".join(config.subreddits)
        )
        post_stream = subreddit_reference.stream.submissions(pause_after=-1)
        for post in post_stream:
            # With pause_after=-1 praw yields None once the stream is drained.
            if post is None:
                break

            post_data = vars(post)
            post_id = post_data["id"]
            # Idiomatic membership test instead of `__contains__`.
            if config.post_ids and post_id not in config.post_ids:
                continue

            # Per-post checkpoint: last seen comment time and comment id.
            post_stat: Dict[str, Any] = state.get(post_id, dict())
            lookup_period: str = post_stat.get("since_time", config.lookup_period)
            lookup_period = lookup_period or DEFAULT_LOOKUP_PERIOD
            if len(lookup_period) <= 5:
                # Short values like "1d"/"12h" are relative periods.
                since_time = convert_utc_time(lookup_period)
            else:
                since_time = datetime.strptime(lookup_period, DATETIME_STRING_PATTERN)

            last_since_time: datetime = since_time

            since_id: Optional[str] = post_stat.get("since_comment_id", None)
            last_index = since_id
            state[post_id] = post_stat

            post.comment_sort = "new"
            post.comments.replace_more(limit=None)

            # top_level_comments only; sorted newest-first per comment_sort above
            first_comment = True
            for comment in post.comments:
                comment_data = vars(comment)
                if config.include_post_meta:
                    comment_data[config.post_meta_field] = post_data

                # NOTE(review): utcfromtimestamp yields a naive datetime; this
                # assumes since_time is naive UTC as well — confirm the
                # convert_utc_time contract.
                comment_time = datetime.utcfromtimestamp(
                    int(comment_data["created_utc"])
                )
                comment_id = comment_data["id"]

                # Stop once we reach comments already processed last run.
                if comment_time < since_time:
                    break
                if last_index and last_index == comment_id:
                    break
                if last_since_time is None or last_since_time < comment_time:
                    last_since_time = comment_time
                # The first (newest) comment becomes the next checkpoint id.
                if last_index is None or first_comment:
                    last_index = comment_id
                first_comment = False

                text = "".join(text_from_html(comment_data["body_html"]))

                source_responses.append(
                    TextPayload(
                        processed_text=text, meta=comment_data, source_name=self.NAME
                    )
                )

            post_stat["since_time"] = last_since_time.strftime(DATETIME_STRING_PATTERN)
            post_stat["since_comment_id"] = last_index

        if update_state and self.store is not None:
            self.store.update_source_state(workflow_id=identifier, state=state)

        return source_responses
diff --git a/obsei_module/obsei-master/obsei/source/test1.py b/obsei_module/obsei-master/obsei/source/test1.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f76dd482ba6f39dc7304fdf299cf3bbfc9c37eb
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/source/test1.py
@@ -0,0 +1,11 @@
from website_crawler_source import TrafilaturaCrawlerConfig
import json

if __name__ == "__main__":
    # Crawl a single news article and print the extracted text as JSON.
    article_url = "https://nld.com.vn/10-tro-ly-cap-cao-cua-tong-thong-han-quoc-dong-loat-tu-chuc-196241204104419743.htm"
    config = TrafilaturaCrawlerConfig(urls=[article_url])
    result = config.extract_url(article_url)

    # extract_url returns an empty dict when the fetch or extraction fails,
    # so guard the key lookup (the original raised KeyError on failure).
    if "text" in result:
        print(json.dumps(result["text"], indent=4, ensure_ascii=False))
    else:
        print("No text could be extracted from the URL")
\ No newline at end of file
diff --git a/obsei_module/obsei-master/obsei/source/twitter_source.py b/obsei_module/obsei-master/obsei/source/twitter_source.py
new file mode 100644
index 0000000000000000000000000000000000000000..819a4f502c973f234759cd67acfe759e3e8f620e
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/source/twitter_source.py
@@ -0,0 +1,327 @@
+import logging
+from datetime import datetime
+
+import pytz
+import requests
+
+from typing import Any, Dict, List, Optional
+
+from pydantic import Field
+from pydantic.types import SecretStr
+from pydantic_settings import BaseSettings
+from searchtweets import collect_results, gen_request_parameters
+
+from obsei.source.base_source import BaseSource, BaseSourceConfig
+from obsei.payload import TextPayload
+
+from obsei.misc.utils import convert_utc_time
+
logger = logging.getLogger(__name__)

# OAuth2 endpoint used to exchange consumer key/secret for a bearer token.
TWITTER_OAUTH_ENDPOINT = "https://api.twitter.com/oauth2/token"

# Default number of tweets fetched per lookup when not configured.
DEFAULT_MAX_TWEETS = 10

# Tweet attributes requested from the v2 search API by default.
DEFAULT_TWEET_FIELDS = [
    "author_id",
    "conversation_id",
    "created_at",
    "entities",
    "geo",
    "id",
    "in_reply_to_user_id",
    "lang",
    "public_metrics",
    "referenced_tweets",
    "source",
    "text",
    "withheld",
]
# Expansion objects embedded alongside each tweet (authors, places, refs).
DEFAULT_EXPANSIONS = [
    "author_id",
    "entities.mentions.username",
    "geo.place_id",
    "in_reply_to_user_id",
    "referenced_tweets.id",
    "referenced_tweets.id.author_id",
]
# Place attributes returned for geo-tagged tweets.
DEFAULT_PLACE_FIELDS = [
    "contained_within",
    "country",
    "country_code",
    "full_name",
    "geo",
    "id",
    "name",
    "place_type",
]
# User attributes returned via the author_id expansion.
DEFAULT_USER_FIELDS = [
    "created_at",
    "description",
    "entities",
    "id",
    "location",
    "name",
    "public_metrics",
    "url",
    "username",
    "verified",
]
# Default query operators: exclude replies and retweets.
DEFAULT_OPERATORS = ["-is:reply", "-is:retweet"]
+
+
class TwitterCredentials(BaseSettings):
    """Twitter API credentials; unset fields fall back to environment variables."""

    bearer_token: SecretStr = Field("", env="twitter_bearer_token")
    consumer_key: SecretStr = Field("", env="twitter_consumer_key")
    consumer_secret: SecretStr = Field("", env="twitter_consumer_secret")
    # v2 recent-search endpoint by default; override via env for other endpoints.
    endpoint: str = Field(
        "https://api.twitter.com/2/tweets/search/recent", env="twitter_endpoint"
    )
    # Extra HTTP headers forwarded to the search request, if any.
    extra_headers_dict: Optional[Dict[str, Any]] = None
+
+
class TwitterSourceConfig(BaseSourceConfig):
    """Configuration for the Twitter search source.

    At lookup time at least one of `query`, `keywords`, `hashtags` or
    `usernames` must be non-empty; field lists default to the module
    constants defined above.
    """

    TYPE: str = "Twitter"
    query: Optional[str] = None
    keywords: Optional[List[str]] = None
    hashtags: Optional[List[str]] = None
    usernames: Optional[List[str]] = None
    operators: Optional[List[str]] = Field(DEFAULT_OPERATORS)
    since_id: Optional[int] = None
    until_id: Optional[int] = None
    lookup_period: Optional[str] = None
    tweet_fields: Optional[List[str]] = Field(DEFAULT_TWEET_FIELDS)
    user_fields: Optional[List[str]] = Field(DEFAULT_USER_FIELDS)
    expansions: Optional[List[str]] = Field(DEFAULT_EXPANSIONS)
    place_fields: Optional[List[str]] = Field(DEFAULT_PLACE_FIELDS)
    max_tweets: int = DEFAULT_MAX_TWEETS
    # NOTE(review): annotated non-Optional but defaults to None; __init__
    # replaces it with TwitterCredentials() — confirm pydantic tolerates this.
    cred_info: TwitterCredentials = Field(None)
    # Deprecated alias for cred_info; see the warning in __init__.
    credential: Optional[TwitterCredentials] = None

    def __init__(self, **data: Any):
        """Resolve credentials (honouring the deprecated `credential` field),
        fetch a bearer token when only consumer key/secret are given, and
        clamp `max_tweets` to the API maximum of 100 per call."""
        super().__init__(**data)

        self.cred_info = self.cred_info or TwitterCredentials()

        if self.credential is not None:
            logger.warning("`credential` is deprecated; use `cred_info`")
            self.cred_info = self.credential

        # No bearer token supplied: derive one from consumer key/secret.
        if self.cred_info.bearer_token.get_secret_value() == '':
            if self.cred_info.consumer_key.get_secret_value() == '' \
                    or self.cred_info.consumer_secret.get_secret_value() == '':
                raise AttributeError(
                    "consumer_key and consumer_secret required to generate bearer_token via Twitter"
                )

            self.cred_info.bearer_token = SecretStr(self.generate_bearer_token())

        if self.max_tweets > 100:
            logger.warning("Twitter API support max 100 tweets per call, hence resetting `max_tweets` to 100")
            self.max_tweets = 100

    def get_twitter_credentials(self) -> Dict[str, Any]:
        """Return the kwargs searchtweets expects for authentication,
        generating a bearer token on demand if it is still empty."""
        if self.cred_info.bearer_token.get_secret_value() == '':
            self.cred_info.bearer_token = SecretStr(self.generate_bearer_token())

        return {
            "bearer_token": self.cred_info.bearer_token.get_secret_value(),
            "endpoint": self.cred_info.endpoint,
            "extra_headers_dict": self.cred_info.extra_headers_dict,
        }

    # Copied from Twitter searchtweets-v2 lib
    def generate_bearer_token(self) -> str:
        """
        Return the bearer token for a given pair of consumer key and secret values.
        """
        data = [("grant_type", "client_credentials")]
        resp = requests.post(
            TWITTER_OAUTH_ENDPOINT,
            data=data,
            auth=(
                self.cred_info.consumer_key.get_secret_value(),
                self.cred_info.consumer_secret.get_secret_value(),
            ),
        )
        logger.warning("Grabbing bearer token from OAUTH")
        # Surface the API error body before raising for easier debugging.
        if resp.status_code >= 400:
            logger.error(resp.text)
            resp.raise_for_status()

        return str(resp.json()["access_token"])
+
+
class TwitterSource(BaseSource):
    """Source that fetches recent tweets matching a configured search query."""

    NAME: str = "Twitter"

    def lookup(self, config: TwitterSourceConfig, **kwargs: Any) -> List[TextPayload]:  # type: ignore[override]
        """Search tweets and return one payload per tweet.

        When a workflow ``id`` is supplied via kwargs and a store is attached,
        the newest tweet id is checkpointed so subsequent calls only return
        newer tweets.

        Raises:
            AttributeError: if none of query/keywords/hashtags/usernames is set.
        """
        if (
            not config.query
            and not config.keywords
            and not config.hashtags
            and not config.usernames
        ):
            raise AttributeError(
                "At least one non empty parameter required (query, keywords, hashtags, and usernames)"
            )

        # The v2 search API expects comma-separated field lists.
        place_fields = (
            ",".join(config.place_fields) if config.place_fields is not None else None
        )
        user_fields = (
            ",".join(config.user_fields) if config.user_fields is not None else None
        )
        expansions = (
            ",".join(config.expansions) if config.expansions is not None else None
        )
        tweet_fields = (
            ",".join(config.tweet_fields) if config.tweet_fields is not None else None
        )

        # Restore checkpoint state for this workflow id, if any.
        identifier: Optional[str] = kwargs.get("id", None)
        state: Optional[Dict[str, Any]] = (
            None
            if identifier is None or self.store is None
            else self.store.get_source_state(identifier)
        )
        since_id: Optional[int] = (
            config.since_id or None if state is None else state.get("since_id", None)
        )
        until_id: Optional[int] = (
            config.until_id or None if state is None else state.get("until_id", None)
        )
        update_state: bool = bool(identifier)
        state = state or dict()
        max_tweet_id = since_id
        lookup_period = config.lookup_period
        if lookup_period is None:
            start_time = None
        elif len(lookup_period) <= 5:
            # Short values like "1d"/"12h" are relative periods.
            start_time = convert_utc_time(lookup_period).replace(tzinfo=pytz.UTC)
        else:
            start_time = datetime.strptime(lookup_period, "%Y-%m-%dT%H:%M:%S%z")

        # Tweet-id pagination takes precedence over a time window.
        if since_id or until_id:
            lookup_period = None

        query = self._generate_query_string(
            query=config.query,
            keywords=config.keywords,
            hashtags=config.hashtags,
            usernames=config.usernames,
            operators=config.operators,
        )

        source_responses: List[TextPayload] = []

        search_query = gen_request_parameters(
            granularity=None,
            query=query,
            results_per_call=config.max_tweets,
            place_fields=place_fields,
            expansions=expansions,
            user_fields=user_fields,
            tweet_fields=tweet_fields,
            since_id=since_id,
            until_id=until_id,
            start_time=lookup_period,
            stringify=False,
        )
        logger.info(search_query)

        tweets_output = collect_results(
            query=search_query,
            max_tweets=config.max_tweets,
            result_stream_args=config.get_twitter_credentials(),
        )

        tweets: List[Dict[str, Any]] = []
        users: List[Dict[str, Any]] = []
        meta_info: Dict[str, Any] = {}

        # BUGFIX: the original `not tweets_output and len(tweets_output) == 0`
        # raised TypeError when collect_results returned None; a plain falsy
        # check covers both None and an empty list.
        if not tweets_output:
            logger.info("No Tweets found")
        else:
            first_page = tweets_output[0]
            tweets = first_page.get("data", tweets)
            users = first_page.get("includes", {}).get("users", users)
            meta_info = first_page.get("meta", meta_info)

        # Map author_id -> user object so each tweet can embed its author.
        user_map: Dict[str, Dict[str, Any]] = {}
        if len(users) > 0 and "id" in users[0]:
            for user in users:
                if "username" in user:
                    user["user_url"] = f'https://twitter.com/{user["username"]}'
                user_map[user["id"]] = user

        logger.info(f"Twitter API meta_info='{meta_info}'")

        for tweet in tweets:
            if "author_id" in tweet and tweet["author_id"] in user_map:
                tweet["author_info"] = user_map.get(tweet["author_id"])

            source_responses.append(self._get_source_output(tweet))

            # Stop once a tweet falls before the requested window start.
            if start_time:
                created_date = datetime.strptime(
                    tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z"
                )
                if start_time > created_date:
                    break

        max_tweet_id = meta_info.get("newest_id", max_tweet_id)

        if update_state and self.store is not None:
            state["since_id"] = max_tweet_id
            self.store.update_source_state(workflow_id=identifier, state=state)

        return source_responses

    @staticmethod
    def _generate_query_string(
        query: Optional[str] = None,
        keywords: Optional[List[str]] = None,
        hashtags: Optional[List[str]] = None,
        usernames: Optional[List[str]] = None,
        operators: Optional[List[str]] = None,
    ) -> str:
        """Build the search string: OR-groups of terms plus ANDed operators.

        A non-empty `query` wins outright; otherwise each non-empty list of
        keywords/hashtags/usernames becomes one parenthesised OR group, and
        operators (e.g. "-is:reply") are appended as a trailing group.
        """
        if query:
            return query

        # The original's `else` branches were unreachable (an empty list is
        # falsy), so only the parenthesised form is kept.
        or_tokens = [
            f'({" OR ".join(tokens)})'
            for tokens in (keywords, hashtags, usernames)
            if tokens
        ]
        or_query_str = " OR ".join(or_tokens)

        and_query_str = f' ({" ".join(operators)})' if operators else ""

        return or_query_str + and_query_str

    def _get_source_output(self, tweet: Dict[str, Any]) -> TextPayload:
        """Wrap a raw tweet dict into a TextPayload, adding its public URL."""
        tweet["tweet_url"] = f'https://twitter.com/twitter/statuses/{tweet["id"]}'
        return TextPayload(
            processed_text=tweet["text"], meta=tweet, source_name=self.NAME
        )
diff --git a/obsei_module/obsei-master/obsei/source/website_crawler_source.py b/obsei_module/obsei-master/obsei/source/website_crawler_source.py
new file mode 100644
index 0000000000000000000000000000000000000000..f70b6241f6e3df506f8de420e262e6c10e3852e4
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/source/website_crawler_source.py
@@ -0,0 +1,144 @@
+import json
+import logging
+from abc import abstractmethod
+from typing import List, Optional, Dict, Any
+
+import mmh3
+
+from obsei.payload import TextPayload
+from obsei.source.base_source import BaseSource, BaseSourceConfig
+
+logger = logging.getLogger(__name__)
+
+
class BaseCrawlerConfig(BaseSourceConfig):
    """Abstract crawler config: extract one URL, discover more crawlable URLs."""

    TYPE: str = "BaseCrawler"

    @abstractmethod
    def extract_url(self, url: str, url_id: Optional[str] = None) -> Dict[str, Any]:
        """Fetch `url` and return its extracted content as a dict."""
        pass

    @abstractmethod
    def find_urls(self, url: str) -> List[str]:
        """Return crawlable page URLs discovered from `url` (feed/sitemap)."""
        pass
+
+
class TrafilaturaCrawlerConfig(BaseCrawlerConfig):
    """Crawler configuration backed by the trafilatura library.

    To understand these configuration params refer:
    https://trafilatura.readthedocs.io/
    """

    # Ask trafilatura for JSON so results can be parsed into dicts.
    # NOTE(review): plain underscore attribute on a pydantic model — confirm it
    # is treated as a class-level constant rather than a model field.
    _output_format: str = "json"
    TYPE: str = "Crawler"
    urls: List[str]
    include_comments: bool = False
    include_tables: bool = True
    no_fallback: bool = False
    include_images: bool = False
    include_formatting: bool = False
    deduplicate: bool = True
    no_ssl: bool = False
    is_feed: bool = False
    is_sitemap: bool = False
    include_links: bool = True
    target_language: Optional[str] = None
    url_blacklist: Optional[List[str]] = None

    def extract_url(self, url: str, url_id: Optional[str] = None) -> Dict[str, Any]:
        """Fetch `url` and return trafilatura's extraction as a dict.

        Returns an empty dict when trafilatura is missing, the fetch fails,
        or nothing could be extracted.
        """
        # BUGFIX: catch ImportError specifically instead of a bare `except:`,
        # which also swallowed KeyboardInterrupt/SystemExit.
        try:
            from trafilatura import extract, fetch_url
        except ImportError:
            logger.error("Trafilatura is not installed, install as follows: pip install trafilatura")
            return {}

        # Stable per-URL id derived from a murmur3 hash unless the caller supplies one.
        url_id = url_id or "{:02x}".format(mmh3.hash(url, signed=False))
        url_content = fetch_url(
            url=url,
            no_ssl=self.no_ssl,
        )
        extracted_dict: Dict[str, Any] = {}
        if url_content is not None:
            extracted_data = extract(
                filecontent=url_content,
                record_id=url_id,
                no_fallback=self.no_fallback,
                output_format=self._output_format,
                include_comments=self.include_comments,
                include_tables=self.include_tables,
                include_images=self.include_images,
                include_formatting=self.include_formatting,
                include_links=self.include_links,
                deduplicate=self.deduplicate,
                url_blacklist=self.url_blacklist,
                target_language=self.target_language,
            )

            if extracted_data:
                extracted_dict = json.loads(extracted_data)
                # Drop the duplicate raw-text field from the returned payload.
                if "raw-text" in extracted_dict:
                    del extracted_dict["raw-text"]

        return extracted_dict

    def find_urls(self, url: str) -> List[str]:
        """Discover crawlable page URLs from a sitemap or feed URL."""
        # BUGFIX: same narrow ImportError handling as extract_url.
        try:
            from trafilatura import feeds, sitemaps
        except ImportError:
            logger.error("Trafilatura is not installed, install as follows: pip install trafilatura")
            return []

        urls: List[str] = []
        if self.is_sitemap:
            urls = sitemaps.sitemap_search(url=url, target_lang=self.target_language)
        elif self.is_feed:
            urls = feeds.find_feed_urls(url=url, target_lang=self.target_language)

        return urls
+
+
class TrafilaturaCrawlerSource(BaseSource):
    """Source that crawls configured URLs (or URLs discovered via feed/sitemap)."""

    NAME: Optional[str] = "Crawler"

    def lookup(  # type: ignore[override]
        self, config: TrafilaturaCrawlerConfig, **kwargs: Any
    ) -> List[TextPayload]:
        """Crawl every resolved URL and return one payload per page."""
        source_responses: List[TextPayload] = []

        # Expand sitemap/feed URLs into concrete page URLs first.
        final_urls = []
        if config.is_sitemap or config.is_feed:
            for url in config.urls:
                final_urls.extend(config.find_urls(url=url))
        else:
            final_urls = config.urls

        for url in final_urls:
            extracted_data = config.extract_url(url=url)
            # BUGFIX: extract_url signals failure with an empty dict, never
            # None, so the original `is None` check could not trigger and the
            # 'text' access below raised KeyError instead of skipping.
            if not extracted_data:
                logger.warning(f"Unable to crawl {url}, hence skipping it")
                continue
            comments = (
                "" if "comments" not in extracted_data else extracted_data["comments"]
            )
            source_responses.append(
                TextPayload(
                    processed_text=f"{extracted_data['text']}. {comments}",
                    meta=extracted_data,
                    source_name=self.NAME,
                )
            )

        return source_responses
+
if __name__ == "__main__":
    # Configure the crawler with the article URL to fetch.
    article_url = "https://nld.com.vn/10-tro-ly-cap-cao-cua-tong-thong-han-quoc-dong-loat-tu-chuc-196241204104419743.htm"
    crawler_config = TrafilaturaCrawlerConfig(urls=[article_url])

    # Run the lookup and print each returned payload as pretty-printed JSON.
    crawler_source = TrafilaturaCrawlerSource()
    for payload in crawler_source.lookup(config=crawler_config):
        print(json.dumps(payload.dict(), indent=4, ensure_ascii=False))
diff --git a/obsei_module/obsei-master/obsei/source/youtube_reviews.py b/obsei_module/obsei-master/obsei/source/youtube_reviews.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/obsei_module/obsei-master/obsei/source/youtube_scrapper.py b/obsei_module/obsei-master/obsei/source/youtube_scrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..406a03a522ddd39d29c846731bf69ad2e78b7d32
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/source/youtube_scrapper.py
@@ -0,0 +1,111 @@
+import logging
+from datetime import datetime
+
+from pydantic import PrivateAttr
+from typing import Optional, List, Any, Dict
+
+from obsei.misc.utils import DEFAULT_LOOKUP_PERIOD, convert_utc_time, DATETIME_STRING_PATTERN
+from obsei.misc.youtube_reviews_scrapper import YouTubeCommentExtractor
+from obsei.payload import TextPayload
+from obsei.source.base_source import BaseSource, BaseSourceConfig
+
+logger = logging.getLogger(__name__)
+
+
class YoutubeScrapperConfig(BaseSourceConfig):
    """Configuration for scraping comments from a YouTube video page.

    Either `video_id` or `video_url` must be provided; the URL is derived
    from the id when absent.
    """

    # Template used to build video_url from video_id.
    _YT_VIDEO_URL: str = PrivateAttr('https://www.youtube.com/watch?v={video_id}')
    TYPE: str = "YoutubeScrapper"
    video_id: Optional[str] = None
    video_url: Optional[str] = None
    # Desktop Chrome user agent used for the scraping HTTP requests.
    user_agent: str = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    sort_by: int = 1  # 0 = sort by popular, 1 = sort by recent
    max_comments: Optional[int] = 20
    fetch_replies: bool = False
    lang_code: Optional[str] = None
    # Delay between scraping requests, in seconds.
    sleep_time: float = 0.1
    request_retries: int = 5
    lookup_period: Optional[str] = None

    def __init__(self, **data: Any):
        """Validate the model and derive `video_url` from `video_id` if needed.

        Raises:
            ValueError: if neither `video_id` nor `video_url` is given.
        """
        super().__init__(**data)

        if not self.video_id and not self.video_url:
            raise ValueError("Either `video_id` or `video_url` is required")

        if not self.video_url:
            self.video_url = self._YT_VIDEO_URL.format(video_id=self.video_id)
+
+
class YoutubeScrapperSource(BaseSource):
    """Source that scrapes YouTube video comments without the official API."""

    NAME: Optional[str] = "YoutubeScrapper"

    def lookup(self, config: YoutubeScrapperConfig, **kwargs: Any) -> List[TextPayload]:  # type: ignore[override]
        """Fetch comments newer than the stored checkpoint for the video."""
        source_responses: List[TextPayload] = []

        # Restore state for this workflow id, if any.
        identifier: Optional[str] = kwargs.get("id", None)
        # BUGFIX: the original tested the *builtin* `id` (always non-None),
        # so the store was queried even when no workflow id was supplied.
        state: Optional[Dict[str, Any]] = (
            None
            if identifier is None or self.store is None
            else self.store.get_source_state(identifier)
        )
        update_state: bool = bool(identifier)
        state = state or dict()

        lookup_period: str = state.get("since_time", config.lookup_period)
        lookup_period = lookup_period or DEFAULT_LOOKUP_PERIOD
        if len(lookup_period) <= 5:
            # Short values like "1d"/"12h" are relative periods.
            since_time = convert_utc_time(lookup_period)
        else:
            since_time = datetime.strptime(lookup_period, DATETIME_STRING_PATTERN)

        last_since_time: datetime = since_time
        since_id: Optional[str] = state.get("since_id", None)
        last_index = since_id

        comments: Optional[List[Dict[str, Any]]] = None
        try:
            if not config.video_url:
                raise RuntimeError("`video_url` in config should not be empty or None")

            scrapper: YouTubeCommentExtractor = YouTubeCommentExtractor(
                video_url=config.video_url,
                user_agent=config.user_agent,
                sort_by=config.sort_by,
                max_comments=config.max_comments,
                fetch_replies=config.fetch_replies,
                lang_code=config.lang_code,
                sleep_time=config.sleep_time,
                request_retries=config.request_retries,
            )

            comments = scrapper.fetch_comments(until_datetime=since_time)
        except RuntimeError as ex:
            # Best-effort: log and fall through with an empty comment list.
            logger.warning(ex.__cause__)

        comments = comments or []

        for comment in comments:
            source_responses.append(
                TextPayload(
                    processed_text=comment["text"],
                    meta=comment,
                    source_name=self.NAME,
                )
            )

            comment_time = comment["time"]

            # Track the newest comment time as the next checkpoint.
            if comment_time is not None and (
                last_since_time is None or last_since_time < comment_time
            ):
                last_since_time = comment_time
            if last_index is None:
                # Assuming list is sorted based on time
                last_index = comment["comment_id"]

        state["since_time"] = last_since_time.strftime(DATETIME_STRING_PATTERN)
        state["since_id"] = last_index

        if update_state and self.store is not None:
            self.store.update_source_state(workflow_id=identifier, state=state)

        return source_responses
diff --git a/obsei_module/obsei-master/obsei/workflow/__init__.py b/obsei_module/obsei-master/obsei/workflow/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/obsei_module/obsei-master/obsei/workflow/base_store.py b/obsei_module/obsei-master/obsei/workflow/base_store.py
new file mode 100644
index 0000000000000000000000000000000000000000..4cd761bdacac404cc85e076010ad0e88eb397661
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/workflow/base_store.py
@@ -0,0 +1,34 @@
+from abc import abstractmethod
+from typing import Any, Dict, Optional
+
+from pydantic_settings import BaseSettings
+
+
class BaseStore(BaseSettings):
    """Abstract persistence interface for per-workflow source/sink/analyzer state."""

    @abstractmethod
    def get_source_state(self, id: str) -> Optional[Dict[str, Any]]:
        """Return the stored source state for workflow `id`, or None."""
        pass

    @abstractmethod
    def get_sink_state(self, id: str) -> Optional[Dict[str, Any]]:
        """Return the stored sink state for workflow `id`, or None."""
        pass

    @abstractmethod
    def get_analyzer_state(self, id: str) -> Optional[Dict[str, Any]]:
        """Return the stored analyzer state for workflow `id`, or None."""
        pass

    @abstractmethod
    def update_source_state(self, workflow_id: str, state: Dict[str, Any]) -> Optional[Any]:
        """Persist a new source state for the workflow."""
        pass

    @abstractmethod
    def update_sink_state(self, workflow_id: str, state: Dict[str, Any]) -> None:
        """Persist a new sink state for the workflow."""
        pass

    @abstractmethod
    def update_analyzer_state(self, workflow_id: str, state: Dict[str, Any]) -> None:
        """Persist a new analyzer state for the workflow."""
        pass

    @abstractmethod
    def delete_workflow(self, id: str) -> None:
        """Remove all stored data for workflow `id`."""
        pass
diff --git a/obsei_module/obsei-master/obsei/workflow/store.py b/obsei_module/obsei-master/obsei/workflow/store.py
new file mode 100644
index 0000000000000000000000000000000000000000..73501cf99d8e2c76bbb83ae046a94ac7b6a0738f
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/workflow/store.py
@@ -0,0 +1,198 @@
+import json
+import logging
+from typing import Any, Dict, List, Optional
+from uuid import uuid4
+
+from pydantic import PrivateAttr
+from sqlalchemy import Column, DateTime, String, create_engine, func
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import sessionmaker
+
+from obsei.misc.utils import obj_to_json
+from obsei.workflow.base_store import BaseStore
+from obsei.workflow.workflow import WorkflowState, WorkflowConfig, Workflow
+
+logger = logging.getLogger(__name__)
+
Base = declarative_base()  # type: Any


class ORMBase(Base):  # type: ignore
    """Common columns shared by all tables: string-UUID primary key and timestamps."""

    __abstract__ = True

    # String UUID primary key, generated per row via the default callable.
    id = Column(String(100), default=lambda: str(uuid4()), primary_key=True)
    created = Column(DateTime, server_default=func.now())
    updated = Column(DateTime, server_default=func.now(), server_onupdate=func.now())
+
+
class WorkflowTable(ORMBase):
    """One row per workflow: serialized config plus per-stage JSON state blobs."""

    __tablename__ = "workflow"

    config = Column(String(2000), nullable=False)
    source_state = Column(String(500), nullable=True)
    sink_state = Column(String(500), nullable=True)
    analyzer_state = Column(String(500), nullable=True)
+
+
class WorkflowStore(BaseStore):
    """SQL-backed store (SQLite by default) for workflow configs and states."""

    _session: sessionmaker = PrivateAttr()

    def __init__(self, url: str = "sqlite:///obsei.db", **data: Any):
        """Connect to `url`, create missing tables and open a session."""
        super().__init__(**data)
        engine = create_engine(url)
        ORMBase.metadata.create_all(engine)
        local_session = sessionmaker(bind=engine)
        self._session = local_session()

    def get(self, identifier: str) -> Optional[Workflow]:
        """Return the workflow with the given id, or None if not found."""
        row = self._session.query(WorkflowTable).filter_by(id=identifier).all()
        return (
            None
            if row is None or len(row) == 0
            else self._convert_sql_row_to_workflow_data(row[0])
        )

    def get_all(self) -> List[Workflow]:
        """Return every stored workflow."""
        rows = self._session.query(WorkflowTable).all()
        return [self._convert_sql_row_to_workflow_data(row) for row in rows]

    def get_workflow_state(self, identifier: str) -> Optional[WorkflowState]:
        """Return the combined source/sink/analyzer state for a workflow.

        BUGFIX: Query.filter() takes criterion expressions, not keyword
        arguments; the original `filter(id=identifier)` raised TypeError.
        """
        row = (
            self._session.query(
                WorkflowTable.source_state,
                WorkflowTable.analyzer_state,
                WorkflowTable.sink_state,
            )
            .filter(WorkflowTable.id == identifier)
            .all()
        )

        return (
            None
            if row is None or len(row) == 0
            else self._convert_sql_row_to_workflow_state(row[0])
        )

    def get_source_state(self, identifier: str) -> Optional[Dict[str, Any]]:
        """Return the parsed source state JSON for a workflow, or None."""
        row = (
            self._session.query(WorkflowTable.source_state)
            .filter(WorkflowTable.id == identifier)
            .all()
        )
        # Guard against an unknown workflow id (empty result set) too; the
        # original indexed row[0] unconditionally and raised IndexError.
        if not row or row[0].source_state is None:
            return None
        return json.loads(row[0].source_state)

    def get_sink_state(self, identifier: str) -> Optional[Dict[str, Any]]:
        """Return the parsed sink state JSON for a workflow, or None.

        BUGFIX: same keyword-filter fix as get_workflow_state.
        """
        row = (
            self._session.query(WorkflowTable.sink_state)
            .filter(WorkflowTable.id == identifier)
            .all()
        )
        if not row or row[0].sink_state is None:
            return None
        return json.loads(row[0].sink_state)

    def get_analyzer_state(self, identifier: str) -> Optional[Dict[str, Any]]:
        """Return the parsed analyzer state JSON for a workflow, or None.

        BUGFIX: same keyword-filter fix as get_workflow_state.
        """
        row = (
            self._session.query(WorkflowTable.analyzer_state)
            .filter(WorkflowTable.id == identifier)
            .all()
        )
        if not row or row[0].analyzer_state is None:
            return None
        return json.loads(row[0].analyzer_state)

    def add_workflow(self, workflow: Workflow) -> None:
        """Insert a new workflow row, serializing config and states to JSON."""
        self._session.add(
            WorkflowTable(
                id=workflow.id,
                config=obj_to_json(workflow.config),
                source_state=obj_to_json(workflow.states.source_state),
                sink_state=obj_to_json(workflow.states.sink_state),
                analyzer_state=obj_to_json(workflow.states.analyzer_state),
            )
        )
        self._commit_transaction()

    def update_workflow(self, workflow: Workflow) -> None:
        """Overwrite the config and all states of an existing workflow."""
        self._session.query(WorkflowTable).filter_by(id=workflow.id).update(
            {
                WorkflowTable.config: obj_to_json(workflow.config),
                WorkflowTable.source_state: obj_to_json(workflow.states.source_state),
                WorkflowTable.sink_state: obj_to_json(workflow.states.sink_state),
                WorkflowTable.analyzer_state: obj_to_json(
                    workflow.states.analyzer_state
                ),
            },
            synchronize_session=False,
        )
        self._commit_transaction()

    def update_workflow_state(self, workflow_id: str, workflow_state: WorkflowState) -> None:
        """Overwrite all three state columns of a workflow."""
        self._session.query(WorkflowTable).filter_by(id=workflow_id).update(
            {
                WorkflowTable.source_state: obj_to_json(workflow_state.source_state),
                WorkflowTable.sink_state: obj_to_json(workflow_state.sink_state),
                WorkflowTable.analyzer_state: obj_to_json(
                    workflow_state.analyzer_state
                ),
            },
            synchronize_session=False,
        )
        self._commit_transaction()

    def update_source_state(self, workflow_id: str, state: Dict[str, Any]) -> None:
        """Persist a new source state for the workflow."""
        self._session.query(WorkflowTable).filter_by(id=workflow_id).update(
            {WorkflowTable.source_state: obj_to_json(state)}, synchronize_session=False
        )
        self._commit_transaction()

    def update_sink_state(self, workflow_id: str, state: Dict[str, Any]) -> None:
        """Persist a new sink state for the workflow."""
        self._session.query(WorkflowTable).filter_by(id=workflow_id).update(
            {WorkflowTable.sink_state: obj_to_json(state)}, synchronize_session=False
        )
        self._commit_transaction()

    def update_analyzer_state(self, workflow_id: str, state: Dict[str, Any]) -> None:
        """Persist a new analyzer state for the workflow."""
        self._session.query(WorkflowTable).filter_by(id=workflow_id).update(
            {WorkflowTable.analyzer_state: obj_to_json(state)},
            synchronize_session=False,
        )
        self._commit_transaction()

    def delete_workflow(self, id: str) -> None:
        """Delete the workflow row with the given id."""
        self._session.query(WorkflowTable).filter_by(id=id).delete()
        self._commit_transaction()

    def _commit_transaction(self) -> Any:
        """Commit the session, rolling back (and re-raising) on failure."""
        try:
            self._session.commit()
        except Exception as ex:
            logger.error(f"Transaction rollback: {ex.__cause__}")
            # Rollback is important here otherwise self.session will be in
            # inconsistent state and next call will fail
            self._session.rollback()
            raise ex

    @staticmethod
    def _convert_sql_row_to_workflow_state(row: Any) -> Optional[WorkflowState]:
        """Build a WorkflowState from a row; None when no state columns are set."""
        if row is None:
            return None

        source_state_dict = (
            None if row.source_state is None else json.loads(row.source_state)
        )
        sink_state_dict = None if row.sink_state is None else json.loads(row.sink_state)
        analyzer_state_dict = (
            None if row.analyzer_state is None else json.loads(row.analyzer_state)
        )

        workflow_states: Optional[WorkflowState] = None
        if source_state_dict or sink_state_dict or analyzer_state_dict:
            workflow_states = WorkflowState(
                source_state=source_state_dict,
                sink_state=sink_state_dict,
                analyzer_state=analyzer_state_dict,
            )

        return workflow_states

    @staticmethod
    def _convert_sql_row_to_workflow_data(row: Any) -> Workflow:
        """Rehydrate a full Workflow (config + states) from a table row."""
        config_dict = json.loads(row.config)
        workflow = Workflow(
            id=row.id,
            config=WorkflowConfig(**config_dict),
            states=WorkflowStore._convert_sql_row_to_workflow_state(row),
        )
        return workflow
diff --git a/obsei_module/obsei-master/obsei/workflow/workflow.py b/obsei_module/obsei-master/obsei/workflow/workflow.py
new file mode 100644
index 0000000000000000000000000000000000000000..efd79d36988b1820781ecdd2c7527ed082178f23
--- /dev/null
+++ b/obsei_module/obsei-master/obsei/workflow/workflow.py
@@ -0,0 +1,38 @@
+from typing import Any, Dict, Optional
+from uuid import uuid4
+
+from pydantic import BaseModel, Field
+
+from obsei.analyzer.base_analyzer import BaseAnalyzerConfig
+from obsei.sink.base_sink import BaseSinkConfig
+from obsei.source.base_source import BaseSourceConfig
+
+
+class WorkflowConfig(BaseModel):
+    """Static configuration of a workflow: source, sink and analyzer configs,
+    plus an optional scheduling period in seconds."""
+
+    source_config: Optional[BaseSourceConfig] = None
+    sink_config: Optional[BaseSinkConfig] = None
+    analyzer_config: Optional[BaseAnalyzerConfig] = None
+    time_in_seconds: Optional[int] = None
+
+    class Config:
+        # Source/sink/analyzer configs are project types pydantic cannot validate natively.
+        arbitrary_types_allowed = True
+
+
+class WorkflowState(BaseModel):
+    """Mutable runtime state of a workflow, one optional dict per component."""
+
+    source_state: Optional[Dict[str, Any]] = None
+    sink_state: Optional[Dict[str, Any]] = None
+    analyzer_state: Optional[Dict[str, Any]] = None
+
+    class Config:
+        arbitrary_types_allowed = True
+        response_model_exclude_unset = True
+
+
+class Workflow(BaseModel):
+ id: str = str(uuid4())
+ config: WorkflowConfig
+ states: WorkflowState = Field(WorkflowState())
+
+ class Config:
+ arbitrary_types_allowed = True
+ response_model_exclude_unset = True
diff --git a/obsei_module/obsei-master/pyproject.toml b/obsei_module/obsei-master/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..5bc7e67dbbf32032c4ec7306fdba773e17620872
--- /dev/null
+++ b/obsei_module/obsei-master/pyproject.toml
@@ -0,0 +1,131 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+dynamic = ["version"]
+requires-python = ">=3.8"
+name = "obsei"
+authors = [{ name = "Lalit Pagaria", email = "lalit.pagaria@oraika.com" }]
+maintainers = [
+ { name = "Lalit Pagaria", email = "lalit.pagaria@oraika.com" },
+ { name = "Girish Patel", email = "girish.patel@oraika.com" }
+]
+description = "Obsei is an automation tool for text analysis needs"
+readme = "README.md"
+license = { text = "Apache Version 2.0" }
+
+keywords = [
+ "workflow",
+ "customer-support",
+ "customer-feedback",
+ "low-code",
+ "automation",
+ "cognitive-automation",
+ "social-listening",
+ "customer-feedback-analysis",
+ "customer-experience",
+ "market-research",
+ "nlp",
+ "oraika",
+ "obsei"
+]
+
+classifiers = [
+ "Development Status :: 2 - Pre-Alpha",
+ "Intended Audience :: Developers",
+ "Intended Audience :: Customer Service",
+ "Intended Audience :: Science/Research",
+ "Intended Audience :: Information Technology",
+ "License :: OSI Approved :: Apache Software License",
+ "Operating System :: OS Independent",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3.8",
+ "Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
+ "Topic :: Software Development :: Libraries :: Python Modules",
+]
+
+dependencies = [
+ "pytz >= 2023.3.post1",
+ "pydantic >= 2.5.3",
+ "pydantic-settings >= 2.1.0",
+ "python-dateutil >= 2.8.2",
+ "SQLAlchemy >= 2.0.24",
+ "mmh3 >= 4.0.1",
+ "beautifulsoup4 >= 4.9.3",
+ "dateparser >= 1.2.0",
+ "requests >= 2.26.0",
+]
+
+
+[project.optional-dependencies]
+
+twitter-api = ["searchtweets-v2 >= 1.1.1"]
+google-play-scraper = ["google-play-scraper >= 1.2.4"]
+google-play-api = ["google-api-python-client >= 2.111.0"]
+app-store-scraper = ["app-store-reviews-reader >= 1.2"]
+reddit-scraper = ["reddit-rss-reader >= 1.3.2"]
+reddit-api = ["praw >= 7.7.1"]
+pandas = ["pandas >= 2.0.3"]
+google-news-scraper = ["GoogleNews >= 1.6.12"]
+facebook-api = ["python-facebook-api >= 0.15.0"]
+atlassian-api = ["atlassian-python-api >= 3.41.4"]
+elasticsearch = ["elasticsearch >= 8.11.1"]
+slack-api = ["slack-sdk >= 3.26.1"]
+
+source = [
+ "obsei[twitter-api,google-play-scraper,google-play-api,app-store-scraper]",
+ "obsei[reddit-scraper,reddit-api,pandas,google-news-scraper,facebook-api]",
+]
+
+sink = ["obsei[atlassian-api,elasticsearch,slack-api,pandas]"]
+
+analyzer = [
+ "torch >= 2.1.2",
+ "vaderSentiment >= 3.3.2",
+ "transformers >= 4.36.2",
+ "nltk >= 3.8.1",
+ "sentencepiece >= 0.1.99",
+ "presidio-analyzer >= 2.2.351",
+ "presidio-anonymizer >= 2.2.351",
+ "spacy >= 3.7.2",
+]
+
+dev = [
+ "pre-commit >= 2.20.0",
+ "black >= 22.10.0",
+ "mypy >= 0.991",
+ "types-requests",
+ "types-python-dateutil",
+ "types-PyYAML",
+ "types-dateparser",
+ "types-protobuf",
+ "types-pytz",
+ "pytest >= 7.2.0",
+ "pip-tools >= 6.10.0",
+ "coverage >= 6.5.0",
+]
+
+all = ["obsei[analyzer,source,sink]"]
+
+## GPL dependencies (these are optional)
+gpl = ["trafilatura >= 1.6.3"]
+
+[project.urls]
+repository = "https://github.com/obsei/obsei"
+homepage = "https://obsei.com"
+documentation = "https://obsei.com"
+changelog = "https://github.com/obsei/obsei/releases"
+
+[tool.hatch.build.targets.sdist]
+include = ["/obsei"]
+
+[tool.hatch.build.targets.wheel]
+packages = ["obsei"]
+
+[tool.hatch.version]
+path = "obsei/_version.py"
+
diff --git a/obsei_module/obsei-master/sample-ui/Dockerfile b/obsei_module/obsei-master/sample-ui/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..28f57d2ed7d39a985956290ad690cc3b0f1bf01b
--- /dev/null
+++ b/obsei_module/obsei-master/sample-ui/Dockerfile
@@ -0,0 +1,19 @@
+# Image for running the Obsei demo UI (Streamlit) on port 8501.
+FROM python:3.10-slim-bullseye
+
+WORKDIR /home/user
+
+# Build tooling needed by some Python dependencies; clean apt caches in the
+# same layer to keep the image small.
+RUN apt-get update && apt-get install -y --no-install-recommends curl pkg-config cmake git g++ \
+    && apt-get clean autoclean && apt-get autoremove -y \
+    && rm -rf /var/lib/{apt,dpkg,cache,log}/
+
+# Application sources and configuration.
+COPY ui.py /home/user/
+COPY utils.py /home/user/
+COPY config.yaml /home/user/
+COPY requirements.txt /home/user/
+
+RUN pip install --upgrade pip
+RUN pip install --no-cache-dir -r requirements.txt
+
+EXPOSE 8501
+
+CMD ["streamlit", "run", "ui.py"]
diff --git a/obsei_module/obsei-master/sample-ui/README.md b/obsei_module/obsei-master/sample-ui/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a16df6430ff4ed31b98c8e0a23f1c3f617ab395d
--- /dev/null
+++ b/obsei_module/obsei-master/sample-ui/README.md
@@ -0,0 +1,22 @@
+## Demo UI
+
+This is a minimal UI that you can spin up to test Obsei. It's based on Streamlit and is very easy to extend for your own use.
+
+![Screenshot](https://raw.githubusercontent.com/obsei/obsei-resources/master/images/obsei-ui-demo.png)
+
+## Usage
+
+### Option 1: Local
+Execute in this folder:
+```shell
+pip install -r requirements.txt
+streamlit run ui.py
+```
+
+### Option 2: Container
+
+Just run
+```
+docker run -d --name obsei-ui -p 8501:8501 obsei/obsei-ui-demo
+```
+You can find the UI at `http://localhost:8501`
diff --git a/obsei_module/obsei-master/sample-ui/config.yaml b/obsei_module/obsei-master/sample-ui/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..18cc7433b879c600a0313d2cceff6ff05e4cc4e2
--- /dev/null
+++ b/obsei_module/obsei-master/sample-ui/config.yaml
@@ -0,0 +1,372 @@
+source:
+ Youtube Scrapper:
+ _icon_: "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/Youtube.png"
+ _help_:
+ - '`video url` is Youtube video url.'
+ source:
+ _target_: obsei.source.youtube_scrapper.YoutubeScrapperSource
+ config:
+ _target_: obsei.source.youtube_scrapper.YoutubeScrapperConfig
+ video_url: "https://www.youtube.com/watch?v=uZfns0JIlFk"
+ lookup_period: "1Y"
+ max_comments: 10
+ Appstore Scrapper:
+ _icon_: "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/appstore.png"
+ _help_:
+      - '`app_url` is application url on app store.'
+      - 'For example for Xcode - https://apps.apple.com/us/app/xcode/id497799835'
+ source:
+ _target_: obsei.source.appstore_scrapper.AppStoreScrapperSource
+ config:
+ _target_: obsei.source.appstore_scrapper.AppStoreScrapperConfig
+ app_url: "https://apps.apple.com/us/app/gmail-email-by-google/id422689480"
+ lookup_period: "1h"
+ max_count: 5
+ Playstore Scrapper:
+ _icon_: "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/playstore.png"
+ _help_:
+ - '`app_url` is application url on play store'
+ - 'For example for Gmail - https://play.google.com/store/apps/details?id=com.google.android.gm&hl=en_IN&gl=US'
+ source:
+ _target_: obsei.source.playstore_scrapper.PlayStoreScrapperSource
+ config:
+ _target_: obsei.source.playstore_scrapper.PlayStoreScrapperConfig
+ app_url: "https://play.google.com/store/apps/details?id=com.google.android.gm&hl=en_IN&gl=US"
+ lookup_period: "1h"
+ max_count: 5
+ Maps Reviews Scrapper:
+ _icon_: "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/google_maps.png"
+ _help_:
+ - 'Collect `api_key` from https://outscraper.com/'
+ - ''
+ - 'For `queries` enter google maps urls or place ids, for example'
+ - "https://www.google.co.in/maps/place/Taj+Mahal/@27.1751496,78.0399535,17z/data=!4m5!3m4!1s0x39747121d702ff6d:0xdd2ae4803f767dde!8m2!3d27.1751448!4d78.0421422"
+ source:
+ _target_: obsei.source.google_maps_reviews.OSGoogleMapsReviewsSource
+ config:
+ _target_: obsei.source.google_maps_reviews.OSGoogleMapsReviewsConfig
+ api_key: ''
+ queries:
+ - "https://www.google.co.in/maps/place/Taj+Mahal/@27.1751496,78.0399535,17z/data=!4m5!3m4!1s0x39747121d702ff6d:0xdd2ae4803f767dde!8m2!3d27.1751448!4d78.0421422"
+ number_of_reviews: 5
+ Reddit Scrapper:
+ _icon_: "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/reddit.png"
+ _help_:
+ - 'Reddit subreddit, search etc rss url. For proper url refer following link -'
+ - 'https://www.reddit.com/r/pathogendavid/comments/tv8m9/pathogendavids_guide_to_rss_and_reddit/'
+ source:
+ _target_: obsei.source.reddit_source.RedditScrapperSource
+ config:
+ _target_: obsei.source.reddit_source.RedditScrapperConfig
+ url: 'https://www.reddit.com/r/wallstreetbets/comments/.rss?sort=new'
+ lookup_period: "1h"
+ Twitter:
+ _icon_: "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/twitter.png"
+ _help_:
+ - '`query` accept search string, @user or #hashtags also'
+ - ''
+ - 'Need twitter `consumer key` and `secret`, get it from https://developer.twitter.com/en/apply-for-access'
+ source:
+ _target_: obsei.source.twitter_source.TwitterSource
+ config:
+ _target_: obsei.source.twitter_source.TwitterSourceConfig
+ query: "@Twitter"
+ lookup_period: "1h"
+ max_tweets: 10
+ cred_info:
+ _target_: obsei.source.twitter_source.TwitterCredentials
+ consumer_key: ''
+ consumer_secret: ''
+ Facebook:
+ _icon_: "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/facebook.png"
+ _help_:
+ - '`page_id` is id of your facebook page'
+ - ''
+ - 'Need facebook app_id, app_secret and long_term_token. Get it from https://developers.facebook.com/apps/'
+ source:
+ _target_: obsei.source.facebook_source.FacebookSource
+ config:
+ _target_: obsei.source.facebook_source.FacebookSourceConfig
+ page_id: "110844591144719"
+ lookup_period: "1h"
+ cred_info:
+ _target_: obsei.source.facebook_source.FacebookCredentials
+ app_id: ''
+ app_secret: ''
+ long_term_token: ''
+ Email:
+ _icon_: "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/gmail.png"
+ _help_:
+ - 'List of IMAP servers for most commonly used email providers https://www.systoolsgroup.com/imap/'
+ - ''
+ - 'Also, if you are using a `Gmail` account then make sure you allow less secure apps on your account'
+ - 'https://myaccount.google.com/lesssecureapps?pli=1'
+ - 'Also enable IMAP access - https://mail.google.com/mail/u/0/#settings/fwdandpop'
+ source:
+ _target_: obsei.source.email_source.EmailSource
+ config:
+ _target_: obsei.source.email_source.EmailConfig
+ imap_server: 'imap.gmail.com'
+ cred_info:
+ _target_: obsei.source.email_source.EmailCredInfo
+ username: ''
+ password: ''
+ lookup_period: "1h"
+ Reddit:
+ _icon_: "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/reddit.png"
+ _help_:
+ - 'Reddit account `username` and `password` require'
+ - 'Enter list of `subreddits`'
+ source:
+ _target_: obsei.source.reddit_source.RedditSource
+ config:
+ _target_: obsei.source.reddit_source.RedditConfig
+ subreddits:
+ - 'wallstreetbets'
+ cred_info:
+ _target_: obsei.source.reddit_source.RedditCredInfo
+ username: ''
+ password: ''
+ lookup_period: "1h"
+ Google News:
+ _icon_: "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/googlenews.png"
+ _help_:
+ - '`fetch_article` use crawler to fetch full article'
+ source:
+ _target_: obsei.source.google_news_source.GoogleNewsSource
+ config:
+ _target_: obsei.source.google_news_source.GoogleNewsConfig
+ query: "bitcoin"
+ max_results: 3
+ lookup_period: "1d"
+ fetch_article: true
+ Website Crawler:
+ _icon_: "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/webcrawler.png"
+ _help_:
+      - '`urls` is a list of website urls to crawl and analyze.'
+      - ''
+      - 'For example - https://obsei.github.io/obsei/'
+ source:
+ _target_: obsei.source.website_crawler_source.TrafilaturaCrawlerSource
+ config:
+ _target_: obsei.source.website_crawler_source.TrafilaturaCrawlerConfig
+ urls:
+ - 'https://obsei.github.io/obsei/'
+sink:
+ Panda Dataframe:
+ _icon_: "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/pandas.svg"
+ _help_: null
+ sink:
+ _target_: obsei.sink.pandas_sink.PandasSink
+ config:
+ _target_: obsei.sink.pandas_sink.PandasSinkConfig
+ Logger:
+ _icon_: "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/logger.png"
+ _help_: null
+ sink:
+ _target_: obsei.sink.logger_sink.LoggerSink
+ config:
+ _target_: obsei.sink.logger_sink.LoggerSinkConfig
+ Jira:
+ _icon_: "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/jira.png"
+ _help_:
+ - 'For testing purpose you can start jira server locally'
+ - 'Refer https://developer.atlassian.com/server/framework/atlassian-sdk/atlas-run-standalone/'
+ - ''
+ - 'Provide `server url`, `username` and `password` of the user'
+ - ''
+ - '`type` of issue to be created, for more information refer -'
+ - 'https://support.atlassian.com/jira-cloud-administration/docs/what-are-issue-types/'
+ - ''
+ - '`project` in which issue to be created, for more information refer -'
+ - 'https://support.atlassian.com/jira-software-cloud/docs/what-is-a-jira-software-project/'
+ sink:
+ _target_: obsei.sink.jira_sink.JiraSink
+ config:
+ _target_: obsei.sink.jira_sink.JiraSinkConfig
+ url: 'http://localhost:2990/jira'
+ username: ''
+ password: ''
+ issue_type:
+ name: "Task"
+ project:
+ key: ""
+ Zendesk:
+ _icon_: "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/zendesk.png"
+ _help_:
+ - 'For custom domain refer http://docs.facetoe.com.au/zenpy.html#custom-domains'
+ - 'Provide zendesk `domain`'
+ - ''
+ - 'Provide `subdomain` if you have one'
+ - ''
+ - 'Provide zendesk account `email` and `password`'
+ sink:
+ _target_: obsei.sink.zendesk_sink.ZendeskSink
+ config:
+ _target_: obsei.sink.zendesk_sink.ZendeskSinkConfig
+ domain: "zendesk.com"
+ subdomain: null
+ cred_info:
+ _target_: obsei.sink.zendesk_sink.ZendeskCredInfo
+ email: ''
+ password: ''
+ Slack:
+ _icon_: "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/slack.svg"
+ _help_:
+ - 'Provide slack bot/app `token`, for more detail refer -'
+ - 'https://slack.com/intl/en-de/help/articles/215770388-Create-and-regenerate-API-tokens'
+ - ''
+ - 'To get `channel id` refer -'
+ - 'https://stackoverflow.com/questions/40940327/what-is-the-simplest-way-to-find-a-slack-team-id-and-a-channel-id'
+ sink:
+ _target_: obsei.sink.slack_sink.SlackSink
+ config:
+ _target_: obsei.sink.slack_sink.SlackSinkConfig
+ slack_token: ''
+ channel_id: ''
+ jinja_template: |
+ :bell: Hi there!, a new notification by *Obsei*
+ >Content:
+ ```
+ {%- for key, value in payload.items() recursive%}
+ {%- if value is mapping -%}
+ {{loop(value.items())}}
+ {%- else %}
+ {{key}}: {{value}}
+ {%- endif %}
+ {%- endfor%}
+ ```
+ Elastic:
+ _icon_: "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/elastic.png"
+ _help_:
+ - 'For testing purpose you can start Elasticsearch server locally via docker'
+ - '`docker run -d --name elasticsearch -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.9.2`'
+ - ''
+ - ' Provide server `hostname`, `port` along with `index` to be used'
+ sink:
+ _target_: obsei.sink.elasticsearch_sink.ElasticSearchSink
+ config:
+ _target_: obsei.sink.elasticsearch_sink.ElasticSearchSinkConfig
+ host: "localhost"
+ port: 9200
+ index_name: "test"
+ Http:
+ _icon_: "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/http_api.png"
+ _help_:
+ - 'For testing purpose you can create mock http server via postman, refer -'
+ - 'https://learning.postman.com/docs/designing-and-developing-your-api/mocking-data/setting-up-mock/'
+ - ''
+ - 'Provide http server `url` and `headers`'
+ sink:
+ _target_: obsei.sink.http_sink.HttpSink
+ config:
+ _target_: obsei.sink.http_sink.HttpSinkConfig
+ url: 'https://localhost:8080/api/path'
+ headers:
+ Content-type: "application/json"
+analyzer:
+ Sentiment:
+ _icon_: "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/sentiment.png"
+ Transformer:
+ _help_:
+ - 'For supported models refer https://huggingface.co/models?filter=zero-shot-classification'
+ - ''
+ - 'Possible device values are `auto` (cuda:0 if available otherwise cpu), `cpu` and `cuda:{id}` (cuda device id)'
+ config:
+ _target_: obsei.analyzer.sentiment_analyzer.TransformersSentimentAnalyzerConfig
+ labels:
+ - "positive"
+ - "negative"
+ multi_class_classification: false
+ analyzer:
+ _target_: obsei.analyzer.sentiment_analyzer.TransformersSentimentAnalyzer
+ model_name_or_path: "typeform/mobilebert-uncased-mnli"
+ device: "auto"
+ Vader:
+ _help_:
+ - 'Vader is less resource hungry dictionary based Vader Sentiment detector'
+ analyzer:
+ _target_: obsei.analyzer.sentiment_analyzer.VaderSentimentAnalyzer
+ Classification:
+ _icon_: "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/classification.png"
+ Transformer:
+ _help_:
+ - 'For supported models refer https://huggingface.co/models?filter=zero-shot-classification'
+ - ''
+ - 'Provide classification `labels`, two labels "positive" and "negative" are added by default'
+ - ''
+ - 'Possible device values are `auto` (cuda:0 if available otherwise cpu), `cpu` and `cuda:{id}` (cuda device id)'
+ config:
+ _target_: obsei.analyzer.classification_analyzer.ClassificationAnalyzerConfig
+ labels:
+ - "service"
+ - "content"
+ - "interface"
+ multi_class_classification: true
+ analyzer:
+ _target_: obsei.analyzer.classification_analyzer.ZeroShotClassificationAnalyzer
+ model_name_or_path: "typeform/mobilebert-uncased-mnli"
+ device: "auto"
+ Named Entity Recognition:
+ _icon_: "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/ner.png"
+ Transformer:
+ _help_:
+ - 'For supported models refer https://huggingface.co/models?filter=token-classification'
+ - ''
+ - 'Possible device values are `auto` (cuda:0 if available otherwise cpu), `cpu` and `cuda:{id}` (cuda device id)'
+ analyzer:
+ _target_: obsei.analyzer.ner_analyzer.TransformersNERAnalyzer
+ model_name_or_path: "elastic/distilbert-base-cased-finetuned-conll03-english"
+ device: "auto"
+ Spacy:
+ _help_:
+ - 'For supported models refer https://spacy.io/models'
+ analyzer:
+ _target_: obsei.analyzer.ner_analyzer.SpacyNERAnalyzer
+ model_name_or_path: "en_core_web_sm"
+ Translation:
+ _icon_: "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/translator.png"
+ Transformer:
+ _help_:
+ - 'For supported models refer https://huggingface.co/models?pipeline_tag=translation'
+ - ''
+ - 'Possible device values are `auto` (cuda:0 if available otherwise cpu), `cpu` and `cuda:{id}` (cuda device id)'
+ analyzer:
+ _target_: obsei.analyzer.translation_analyzer.TranslationAnalyzer
+ model_name_or_path: "Helsinki-NLP/opus-mt-en-hi"
+ device: "auto"
+ PII Anonymizer:
+ _icon_: "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/pii.png"
+ Presidio:
+ _help_:
+ - '`analyze_only` decide whether to return only pii analysis or anonymize text'
+ - ''
+ - '`return_decision_process` decide whether to return detail information about anonymization decision'
+ - ''
+ - 'For `nlp_engine_name` spacy and stanza nlp engines are supported, For more info refer -'
+ - 'https://microsoft.github.io/presidio/analyzer/developing_recognizers/#utilize-spacy-or-stanza'
+ - ''
+ - 'Provide `model_name` and `lang_code` of the model'
+ config:
+ _target_: obsei.analyzer.pii_analyzer.PresidioPIIAnalyzerConfig
+ analyze_only: false
+ return_decision_process: false
+ analyzer:
+ _target_: obsei.analyzer.pii_analyzer.PresidioPIIAnalyzer
+ engine_config:
+ _target_: obsei.analyzer.pii_analyzer.PresidioEngineConfig
+ nlp_engine_name: "spacy"
+ models:
+ - _target_: obsei.analyzer.pii_analyzer.PresidioModelConfig
+ model_name: "en_core_web_md"
+ lang_code: "en"
+ Dummy:
+ _icon_: "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/dummy.png"
+ Dummy:
+ _help_:
+ - 'Dummy Analyzer, do nothing it simply used for transforming input to output'
+ config:
+ _target_: obsei.analyzer.dummy_analyzer.DummyAnalyzerConfig
+ analyzer:
+ _target_: obsei.analyzer.dummy_analyzer.DummyAnalyzer
diff --git a/obsei_module/obsei-master/sample-ui/requirements.txt b/obsei_module/obsei-master/sample-ui/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f5bd699bdd5d799a2f7ae1f47386fd1e49181ad5
--- /dev/null
+++ b/obsei_module/obsei-master/sample-ui/requirements.txt
@@ -0,0 +1,4 @@
+git+https://github.com/obsei/obsei@master#egg=obsei[all]
+streamlit
+trafilatura
+tornado>=6.3.2 # not directly required, pinned by Snyk to avoid a vulnerability
diff --git a/obsei_module/obsei-master/sample-ui/ui.py b/obsei_module/obsei-master/sample-ui/ui.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce07ce6fb237f9daf68283f87f71d9b3cd6b6da3
--- /dev/null
+++ b/obsei_module/obsei-master/sample-ui/ui.py
@@ -0,0 +1,96 @@
+from utils import *
+
+# Streamlit demo UI that lets a user compose an Obsei pipeline
+# (source/Observer -> analyzer -> sink/Informer) from config.yaml,
+# run it, and download the equivalent Python or YAML.
+current_path = pathlib.Path(__file__).parent.absolute().as_posix()
+configuration = get_obsei_config(current_path, "config.yaml")
+logo_url = "https://raw.githubusercontent.com/obsei/obsei-resources/master/logos/obsei_200x200.png"
+
+st.set_page_config(page_title="Obsei Demo", layout="wide", page_icon=logo_url)
+
+st.title("Obsei Demo").markdown(
+    get_icon_name("Obsei Demo", logo_url, 60, 35), unsafe_allow_html=True
+)
+
+st.success(
+    """
+Please ⭐ the repo and share the feedback at https://github.com/obsei/obsei?utm_source=streamlit
+    """
+)
+st.warning(
+    """
+**Note:** Demo run will require some secure information based on source or sink selected,
+if you don't trust this environment please close the app.
+"""
+)
+
+# Top row: pipeline summary, spinner placeholder, execute and download buttons.
+(
+    pipeline_col,
+    spinner_col,
+    execute_col,
+    download_python_col,
+    download_yaml_col,
+) = st.columns([2, 2, 1, 1, 1])
+
+# One column per pipeline stage.
+col_map = dict()
+col_map["source"], col_map["analyzer"], col_map["sink"] = st.columns([1, 1, 1])
+
+selected = {}
+name_map = {"source": "Observer", "analyzer": "Analyzer", "sink": "Informer"}
+
+# Stage selectors: one selectbox per stage, options taken from config.yaml.
+for node_name, col in col_map.items():
+    item_list = [k for k in configuration[node_name].keys()]
+    selected[node_name] = col.selectbox(f"Select {name_map[node_name]}", item_list)
+
+icons = [get_icon_name(None, configuration[k][v]["_icon_"]) for k, v in selected.items()]
+pipeline_col.header("Pipeline").markdown(
+    f"**Pipeline:** {icons[0]} ➡➡ {icons[1]} ➡➡ {icons[2]}",
+    unsafe_allow_html=True,
+)
+
+generate_config = {}
+log_component = {}
+for node_name, node_value in selected.items():
+    type_config = configuration[node_name][node_value]
+    # Analyzers have an extra nesting level (e.g. Transformer vs Vader):
+    # offer a sub-type selector and descend into the chosen entry.
+    if node_name == "analyzer":
+        type_list = []
+        for config_key in type_config.keys():
+            if config_key != "_icon_":
+                type_list.append(config_key)
+        selected_type = col_map[node_name].selectbox(f"{name_map[node_name]} Type", type_list)
+        type_config = type_config[selected_type]
+
+    config = None
+    if "config" in type_config:
+        config = type_config["config"]
+    if type_config["_help_"] is not None:
+        with col_map[node_name].expander("Config Help Info", False):
+            help_area = "\n".join(type_config["_help_"])
+            st.code(f"{help_area}")
+
+    # Editable config form for the stage, if it declares a config section.
+    config_expander = None
+    if config is not None:
+        config_expander = col_map[node_name].expander(f"Configure {name_map[node_name]}", False)
+        render_config(config, config_expander)
+
+    # Analyzers may also expose tunables (model name, device) on the
+    # analyzer object itself, not just on its config.
+    if node_name == "analyzer" and node_name in type_config and len(type_config[node_name]) > 1:
+        config_expander = config_expander or col_map[node_name].expander(f"Configure {name_map[node_name]}", False)
+        render_config(type_config["analyzer"], config_expander)
+
+    generate_config[node_name] = type_config[node_name]
+    generate_config[f"{node_name}_config"] = config
+
+    # Per-stage log placeholder, filled in by execute_workflow.
+    log_expander = col_map[node_name].expander(f"{name_map[node_name]} Logs", True)
+    log_component[node_name] = log_expander.empty()
+    log_component[node_name].write("Run \"🚀 Execute\" first")
+
+python_code = generate_python(generate_config)
+yaml_code = generate_yaml(generate_config)
+
+execute_button = execute_col.button("🚀 Execute")
+if execute_button:
+    execute_workflow(generate_config, spinner_col, log_component)
+
+with download_python_col:
+    download_button(python_code, "generated-code.py", "🐍 Download (.py)")
+
+with download_yaml_col:
+    download_button(yaml_code, "generated-config.yaml", "📖 Download (.yaml)")
diff --git a/obsei_module/obsei-master/sample-ui/utils.py b/obsei_module/obsei-master/sample-ui/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9dc10c38f86f86bebd6bb9e7fdd83af917bc6ed1
--- /dev/null
+++ b/obsei_module/obsei-master/sample-ui/utils.py
@@ -0,0 +1,216 @@
+import base64
+import logging
+import pathlib
+import re
+import sys
+import uuid
+
+import streamlit as st
+import yaml
+
+from obsei.configuration import ObseiConfiguration
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+
+def img_to_bytes(img_path):
+    """Read the file at `img_path` and return its contents base64-encoded as a str."""
+    img_bytes = pathlib.Path(img_path).read_bytes()
+    encoded = base64.b64encode(img_bytes).decode()
+    return encoded
+
+
+# Copied from https://github.com/jrieke/traingenerator/blob/main/app/utils.py
+def download_button(
+ object_to_download, download_filename, button_text # , pickle_it=False
+):
+ try:
+ # some strings <-> bytes conversions necessary here
+ b64 = base64.b64encode(object_to_download.encode()).decode()
+ except AttributeError as e:
+ b64 = base64.b64encode(object_to_download).decode()
+
+ button_uuid = str(uuid.uuid4()).replace("-", "")
+ button_id = re.sub("\d+", "", button_uuid)
+
+ custom_css = f"""
+ """
+
+ dl_link = (
+ custom_css
+ + f'{button_text}