Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -6,6 +6,7 @@ import json
|
|
6 |
import concurrent.futures
|
7 |
from concurrent.futures import ThreadPoolExecutor
|
8 |
from langchain_community.document_loaders import PyPDFLoader
|
|
|
9 |
from langdetect import detect_langs
|
10 |
from PyPDF2 import PdfReader
|
11 |
from io import BytesIO
|
@@ -65,7 +66,7 @@ def download_pdf(url, timeout=10):
|
|
65 |
logging.error(f"PDF download error: {e}")
|
66 |
return None
|
67 |
|
68 |
-
def
|
69 |
reader = PdfReader(pdf_file)
|
70 |
extracted_text = ""
|
71 |
try:
|
@@ -80,19 +81,35 @@ def extract_text_from_pages(pdf_file, pages):
|
|
80 |
logging.error(f"PDF text extraction error: {e}")
|
81 |
return 'हे चालत नाही'
|
82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
83 |
def process_link(link, similar_product):
|
84 |
if link in seen:
|
85 |
return None
|
86 |
seen.add(link)
|
87 |
try:
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
|
|
|
|
|
|
96 |
return None
|
97 |
|
98 |
def filtering(urls, similar_product):
|
|
|
6 |
import concurrent.futures
|
7 |
from concurrent.futures import ThreadPoolExecutor
|
8 |
from langchain_community.document_loaders import PyPDFLoader
|
9 |
+
from langchain_community.document_loaders import WebBaseLoader
|
10 |
from langdetect import detect_langs
|
11 |
from PyPDF2 import PdfReader
|
12 |
from io import BytesIO
|
|
|
66 |
logging.error(f"PDF download error: {e}")
|
67 |
return None
|
68 |
|
69 |
+
def extract_text_from_pdf(pdf_file, pages):
|
70 |
reader = PdfReader(pdf_file)
|
71 |
extracted_text = ""
|
72 |
try:
|
|
|
81 |
logging.error(f"PDF text extraction error: {e}")
|
82 |
return 'हे चालत नाही'
|
83 |
|
84 |
+
def extract_text_online(link):
    """Load a web page and return the text of its first few split documents.

    A ``WebBaseLoader`` is created for ``link`` and ``load_and_split()`` is
    called on it; the ``page_content`` of at most the first three returned
    items is concatenated, in order, with no separator.

    Args:
        link: URL handed to ``WebBaseLoader``.

    Returns:
        The concatenated text, or an empty string when nothing was loaded.
    """
    documents = WebBaseLoader(link).load_and_split()
    # Only the leading three items are used.
    leading = documents[:3]
    return ''.join(doc.page_content for doc in leading)
|
95 |
+
|
96 |
+
|
97 |
def process_link(link, similar_product):
    """Return ``link`` if its text passes the language and relevance checks.

    The link is recorded in the module-level ``seen`` collection so that it is
    processed at most once. Links ending in ``.md`` are read with
    ``extract_text_online``; every other link is treated as a PDF, downloaded
    with ``download_pdf`` and read with ``extract_text_from_pdf`` using page
    indices ``[0, 2, 4]``.

    Args:
        link: URL of the document to inspect.
        similar_product: Value forwarded to ``relevant`` together with the
            module-level ``main_product`` and the extracted text.

    Returns:
        ``link`` when both ``language_preprocess(text)`` and
        ``relevant(main_product, similar_product, text)`` are truthy;
        otherwise ``None`` (already seen, download failed, checks failed, or
        an error occurred while processing).
    """
    # Skip links that have already been handled.
    if link in seen:
        return None
    seen.add(link)

    try:
        if link.endswith('.md'):
            text = extract_text_online(link)
        else:
            pdf_file = download_pdf(link)
            if pdf_file is None:
                # download_pdf reports a failed download by returning None;
                # there is nothing to extract in that case.
                return None
            text = extract_text_from_pdf(pdf_file, [0, 2, 4])

        if language_preprocess(text) and relevant(main_product, similar_product, text):
            return link
    except Exception as e:
        # Best-effort filtering: a failing link is dropped, but the failure is
        # logged instead of being silently discarded. Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        logging.error(f"Link processing error for {link}: {e}")
    return None
|
114 |
|
115 |
def filtering(urls, similar_product):
|