Spaces:

kkastr
/

summit

Runtime error

App Files Files Community

kkastr committed on Feb 14, 2023

Commit

95e1833

•

2 Parent(s): 8c58a7e 00320ff

Merge branch 'master' of github.com:kkastr/summit into main

Browse files

Files changed (4) hide show

.gitignore +135 -0
gradio-demo.py +9 -0
scraper.py +51 -0
thread_analyzer.py +57 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,135 @@

+# Project-specific ignores: api_keys.py supplies the Reddit credentials that
+# scraper.py imports, and *.csv covers the comment dumps scraper.py writes.
+# data/, model/ and archive/ are presumably local working directories.
+data/
+model/
+archive/
+api_keys.py
+*.csv
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/

gradio-demo.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import gradio as gr
+def greet(name):
+    """Return the greeting ``Hello <name>!`` for the given name string."""
+    pieces = ["Hello ", name, "!"]
+    return "".join(pieces)
+# Text-in / text-out Gradio interface around greet(). The custom CSS rule is
+# closed with "}" so it is a complete, well-formed rule (the closing brace
+# was previously missing).
+demo = gr.Interface(
+    fn=greet,
+    inputs="text",
+    outputs="text",
+    css=".primary-button {background-color: cyan}",
+)
+# Launch the app when the script is run.
+demo.launch()

scraper.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import praw
+import pandas as pd
+from tqdm import tqdm
+from api_keys import client_id, client_secret, user_agent, username
+def getComments(url):
+    """Collect the comments of a Reddit submission into a DataFrame.
+
+    The submission at ``url`` is loaded through PRAW and
+    ``replace_more(limit=0)`` is called on its comment forest before the
+    flattened comment list is read. Every comment that is not stickied
+    becomes one row. The resulting frame is also written, without the index,
+    to ``<submission id>_comments.csv`` relative to the current directory.
+
+    Args:
+        url: URL of the Reddit submission to read.
+
+    Returns:
+        A DataFrame with the columns ``text``, ``score``, ``id``,
+        ``parent_id``, ``submission_title``, ``submission_score`` and
+        ``submission_id``.
+    """
+    client = praw.Reddit(
+        client_id=client_id,
+        client_secret=client_secret,
+        user_agent=user_agent,
+        username=username,
+    )
+    thread = client.submission(url=url)
+    thread.comments.replace_more(limit=0)
+
+    # One record per non-stickied comment; the submission fields are repeated
+    # on every row so each row is self-describing.
+    records = [
+        [
+            reply.body,
+            reply.score,
+            reply.id,
+            reply.parent_id,
+            thread.title,
+            thread.score,
+            thread.id,
+        ]
+        for reply in thread.comments.list()
+        if not reply.stickied
+    ]
+
+    column_names = [
+        "text",
+        "score",
+        "id",
+        "parent_id",
+        "submission_title",
+        "submission_score",
+        "submission_id"
+    ]
+    frame = pd.DataFrame(data=records, columns=column_names)
+
+    output_name = f'{thread.id}_comments.csv'
+    frame.to_csv(output_name, index=False)
+    return frame
+if __name__ == "__main__":
+    # Placeholder: executing this module directly does nothing.
+    # getComments is imported by thread_analyzer.py.
+    pass

thread_analyzer.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import pandas as pd
+from transformers import pipeline
+from scraper import getComments
+def main(csv_path="111591k_comments.csv"):
+    """Print a summary of the first reply group of a scraped comment CSV.
+
+    The CSV is read into a DataFrame, grouped by its ``parent_id`` column,
+    and the ``text`` values of the first group are joined and passed to a
+    Hugging Face ``summarization`` pipeline. Only the first group is
+    summarized. Nothing is printed when the CSV has no groups.
+
+    Args:
+        csv_path: Path of the comments CSV to read. Defaults to the
+            previously hard-coded ``111591k_comments.csv``.
+    """
+    # durl = "https://www.reddit.com/r/ask/comments/111591k/who_do_you_think_will_start_ww3/"
+    # Here you would probably check if the post id already exists in some DB so that you don't have to refetch comments.
+    # If the pushshift.io submission comments API starts working again, could probably make this all realtime.
+    # df = getComments(url=durl)
+    nlp = pipeline('summarization')
+    df = pd.read_csv(csv_path)
+
+    first_group = next(iter(df.groupby("parent_id")), None)
+    if first_group is None:
+        # No rows to group, so there is nothing to summarize.
+        return
+    _, grp = first_group
+
+    # Join with a space so the last word of one comment does not fuse with
+    # the first word of the next.
+    thread_text = grp["text"].str.cat(sep=" ")
+
+    # truncation=True asks the pipeline to cut input that exceeds the model's
+    # maximum input length instead of failing on it; text beyond that limit
+    # is not summarized.
+    summary = nlp(thread_text, max_length=500, truncation=True)[0]["summary_text"]
+    print(summary)
+if __name__ == "__main__":
+    # Earlier experiment, kept commented out for reference: read
+    # news_comments.csv, take the first post id, summarize its comment bodies
+    # in three slices, then summarize the concatenation of those summaries.
+    # ldf = pd.read_csv('news_comments.csv')
+    # pid = ldf.post_id.unique()[0]
+    # df = ldf[ldf.post_id == pid]
+    # txt = df.body[0:2].str.cat()
+    # nlp = pipeline('summarization')
+    # df.body[4] = ''
+    # text = ' '.join(df.body[0:32])
+    # summary1 = nlp(text, max_length=500)[0]["summary_text"]
+    # text = ' '.join(df.body[33:60])
+    # summary2 = nlp(text, max_length=500)[0]["summary_text"]
+    # text = ' '.join(df.body[61:90])
+    # summary3 = nlp(text, max_length=500)[0]["summary_text"]
+    # summary = summary1 + ' ' + summary2 + ' ' + summary3
+    # nsum = nlp(summary, max_length=500)[0]["summary_text"]
+    # print ("Original Text")
+    # print(summary)
+    # print("Summarised")
+    # Current entry point: summarize the first reply group of the scraped CSV.
+    main()