clean html text and increase char limit on content
- app.py +2 -2
- plagiarism.py +22 -4
- requirements.txt +3 -1
app.py
CHANGED
@@ -220,7 +220,7 @@ def ai_check(text: str, option: str):
 
 def generate_prompt(settings: Dict[str, str]) -> str:
     content_string = "\n".join(
-        f"{url.strip()}: \n{content.strip()[:
+        f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in settings["sources"].items()
     )
 
     prompt = f"""
@@ -256,7 +256,7 @@ def generate_prompt(settings: Dict[str, str]) -> str:
 
 def regenerate_prompt(settings: Dict[str, str]) -> str:
     content_string = "\n".join(
-        f"{url.strip()}: \n{content.strip()[:
+        f"{url.strip()}: \n{content.strip()[:2000]}" for url, content in settings["sources"].items()
     )
 
     prompt = f"""
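For reference, here is the changed step as a standalone sketch; build_content_string is a hypothetical helper name, and the sample mapping is invented for illustration. The slice caps each scraped source at 2000 characters so one long page cannot crowd the others out of the prompt:

    from typing import Dict

    # Hypothetical helper mirroring the changed lines in generate_prompt
    # and regenerate_prompt; `sources` maps a result URL to its scraped text.
    def build_content_string(sources: Dict[str, str]) -> str:
        return "\n".join(
            f"{url.strip()}: \n{content.strip()[:2000]}"
            for url, content in sources.items()
        )

    print(build_content_string({"https://example.com": "Some scraped article text..."}))
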
plagiarism.py
CHANGED
@@ -3,6 +3,21 @@ from googleapiclient.discovery import build
 import asyncio
 import httpx
 from bs4 import BeautifulSoup
+import justext
+import newspaper
+
+
+def clean_html(text):
+    result = ""
+    article = newspaper.Article(url=" ")
+    article.set_html(text)
+    article.parse()
+    result += article.title + "\n"
+    paragraphs = justext.justext(text, justext.get_stoplist("English"))
+    for paragraph in paragraphs:
+        if not paragraph.is_boilerplate:
+            result += paragraph.text
+    return result
 
 
 months = {
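
The new helper combines the two libraries deliberately: newspaper3k recovers the page title from the raw HTML (the single-space url passed to newspaper.Article is just a placeholder, since the HTML arrives via set_html), while jusText keeps only the paragraphs it does not classify as boilerplate. A quick smoke test, assuming the module above is importable; example.com is a stand-in URL:

    import httpx

    # Hypothetical check of clean_html on a live page; failed requests and
    # non-HTML responses are not handled here.
    html = httpx.get("https://example.com").text
    print(clean_html(html)[:200])
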
@@ -55,13 +70,10 @@ def google_search_urls(
     **kwargs,
 ):
     service = build("customsearch", "v1", developerKey=api_key)
-    num_pages = 3
     results = service.cse().list(q=text, cx=cse_id, sort=sorted_date, **kwargs).execute()
     url_list = []
     if "items" in results and len(results["items"]) > 0:
         for count, link in enumerate(results["items"]):
-            if count >= num_pages:
-                break
             # skip user selected domains
             if (domains_to_skip is not None) and any(("." + domain) in link["link"] for domain in domains_to_skip):
                 continue
@@ -100,9 +112,15 @@ def google_search(
     soups = asyncio.run(parallel_scrap(url_list))
     print("SCRAPING PROCESSING TIME: ", time.perf_counter() - start_time)
     result_content = {}
+    num_pages = 3
+    count = 0
     for url, soup in zip(url_list, soups):
+        if count >= num_pages:
+            break
         if soup:
-
+            text = clean_html(soup.text)
+            result_content[url] = text
+            count += 1
     # for key, value in result_content.items():
     #     print("-------------------URL: ", key)
     #     print(value[:30])
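
Note the behavioral change: the three-page cap used to be applied while collecting search URLs, so links that later failed to scrape still consumed the budget; it now counts only pages that actually produced a soup. The same pattern in isolation, with an invented pages mapping standing in for the zip of url_list and the parallel_scrap results:

    # Stand-ins for url_list/soups; a None value mimics a failed fetch.
    pages = {
        "https://a.example": "<html>first hit</html>",
        "https://b.example": None,
        "https://c.example": "<html>second hit</html>",
        "https://d.example": "<html>third hit</html>",
    }

    num_pages = 3
    count = 0
    result_content = {}
    for url, html in pages.items():
        if count >= num_pages:
            break
        if html:  # only successful scrapes consume the budget
            result_content[url] = html
            count += 1

    # b.example failed, so d.example still makes it into the top three.
    assert list(result_content) == ["https://a.example", "https://c.example", "https://d.example"]
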
requirements.txt
CHANGED
@@ -10,4 +10,6 @@ language_tool_python
 scipy
 Unidecode
 BeautifulSoup4
-google-api-python-client
+google-api-python-client
+newspaper3k
+jusText