kkastr committed on
Commit
95e1833
2 Parent(s): 8c58a7e 00320ff

Merge branch 'master' of github.com:kkastr/summit into main

Browse files
Files changed (4) hide show
  1. .gitignore +135 -0
  2. gradio-demo.py +9 -0
  3. scraper.py +51 -0
  4. thread_analyzer.py +57 -0
.gitignore ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data/
2
+ model/
3
+ archive/
4
+ api_keys.py
5
+ *.csv
6
+
7
+ # Byte-compiled / optimized / DLL files
8
+ __pycache__/
9
+ *.py[cod]
10
+ *$py.class
11
+
12
+ # C extensions
13
+ *.so
14
+
15
+ # Distribution / packaging
16
+ .Python
17
+ build/
18
+ develop-eggs/
19
+ dist/
20
+ downloads/
21
+ eggs/
22
+ .eggs/
23
+ lib/
24
+ lib64/
25
+ parts/
26
+ sdist/
27
+ var/
28
+ wheels/
29
+ pip-wheel-metadata/
30
+ share/python-wheels/
31
+ *.egg-info/
32
+ .installed.cfg
33
+ *.egg
34
+ MANIFEST
35
+
36
+ # PyInstaller
37
+ # Usually these files are written by a python script from a template
38
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
39
+ *.manifest
40
+ *.spec
41
+
42
+ # Installer logs
43
+ pip-log.txt
44
+ pip-delete-this-directory.txt
45
+
46
+ # Unit test / coverage reports
47
+ htmlcov/
48
+ .tox/
49
+ .nox/
50
+ .coverage
51
+ .coverage.*
52
+ .cache
53
+ nosetests.xml
54
+ coverage.xml
55
+ *.cover
56
+ *.py,cover
57
+ .hypothesis/
58
+ .pytest_cache/
59
+
60
+ # Translations
61
+ *.mo
62
+ *.pot
63
+
64
+ # Django stuff:
65
+ *.log
66
+ local_settings.py
67
+ db.sqlite3
68
+ db.sqlite3-journal
69
+
70
+ # Flask stuff:
71
+ instance/
72
+ .webassets-cache
73
+
74
+ # Scrapy stuff:
75
+ .scrapy
76
+
77
+ # Sphinx documentation
78
+ docs/_build/
79
+
80
+ # PyBuilder
81
+ target/
82
+
83
+ # Jupyter Notebook
84
+ .ipynb_checkpoints
85
+
86
+ # IPython
87
+ profile_default/
88
+ ipython_config.py
89
+
90
+ # pyenv
91
+ .python-version
92
+
93
+ # pipenv
94
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
96
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
97
+ # install all needed dependencies.
98
+ #Pipfile.lock
99
+
100
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
101
+ __pypackages__/
102
+
103
+ # Celery stuff
104
+ celerybeat-schedule
105
+ celerybeat.pid
106
+
107
+ # SageMath parsed files
108
+ *.sage.py
109
+
110
+ # Environments
111
+ .env
112
+ .venv
113
+ env/
114
+ venv/
115
+ ENV/
116
+ env.bak/
117
+ venv.bak/
118
+
119
+ # Spyder project settings
120
+ .spyderproject
121
+ .spyproject
122
+
123
+ # Rope project settings
124
+ .ropeproject
125
+
126
+ # mkdocs documentation
127
+ /site
128
+
129
+ # mypy
130
+ .mypy_cache/
131
+ .dmypy.json
132
+ dmypy.json
133
+
134
+ # Pyre type checker
135
+ .pyre/
gradio-demo.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+
4
def greet(name):
    """Return the demo's greeting string for *name*."""
    parts = ["Hello ", name, "!"]
    return "".join(parts)
6
+
7
# Wire the greeter into a simple text-in/text-out Gradio interface.
# BUG FIX: the inline css rule was previously unterminated
# (".primary-button {background-color: cyan" — missing the closing "}"),
# producing malformed CSS that browsers may silently drop.
demo = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="text",
    css=".primary-button {background-color: cyan}",
)

# Start the local Gradio server (blocks until interrupted).
demo.launch()
scraper.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import praw
2
+ import pandas as pd
3
+ from tqdm import tqdm
4
+ from api_keys import client_id, client_secret, user_agent, username
5
+
6
+
7
def getComments(url):
    """Scrape the comments of a Reddit submission into a CSV file.

    Fetches the submission at *url* via PRAW, flattens its comment tree
    (without fetching "MoreComments" stubs), skips stickied comments, and
    writes one row per comment to '<submission_id>_comments.csv'.

    Returns the resulting pandas DataFrame.
    """

    columns = [
        "text",
        "score",
        "id",
        "parent_id",
        "submission_title",
        "submission_score",
        "submission_id",
    ]

    reddit = praw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent=user_agent,
        username=username,
    )

    submission = reddit.submission(url=url)
    # limit=0 drops "MoreComments" placeholders instead of expanding them.
    submission.comments.replace_more(limit=0)

    records = [
        [
            comment.body,
            comment.score,
            comment.id,
            comment.parent_id,
            submission.title,
            submission.score,
            submission.id,
        ]
        for comment in submission.comments.list()
        if not comment.stickied
    ]

    df = pd.DataFrame(data=records, columns=columns)

    # Side effect: persist the scrape so later runs can reuse it.
    df.to_csv(f'{submission.id}_comments.csv', index=False)

    return df
49
+
50
# Entry-point guard: this module currently has no CLI behavior — it is
# meant to be imported for getComments().
if __name__ == "__main__":
    pass
thread_analyzer.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from transformers import pipeline
3
+ from scraper import getComments
4
+
5
+
6
def main():
    """Summarize one comment thread from a previously scraped CSV.

    Loads the cached comments for post 111591k, groups them by parent
    comment, runs a HuggingFace summarization pipeline over the first
    group's concatenated text, and prints the summary.

    NOTE(review): only the first groupby group is processed (the loop
    exits immediately) — presumably a debugging shortcut; confirm before
    extending this to the full thread.
    """
    # here you would probably check if the post id already exists in some
    # DB so that you don't have to refetch comments. if pushshift.io
    # submission comments api starts working again, could probably make
    # this all realtime.
    # df = getComments(url="https://www.reddit.com/r/ask/comments/111591k/who_do_you_think_will_start_ww3/")

    summarizer = pipeline('summarization')
    comments = pd.read_csv("111591k_comments.csv")

    for _, thread in comments.groupby("parent_id"):
        text_blob = thread.text.str.cat()
        print(summarizer(text_blob, max_length=500)[0]["summary_text"])
        break
24
+
25
# Entry-point guard; earlier commented-out summarization experiments
# (manual chunking of a news_comments.csv into three summaries) were
# dead code and have been removed — see git history if needed.
if __name__ == "__main__":
    main()