Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -7,15 +7,51 @@ import re
|
|
7 |
import zipfile
|
8 |
import xml.etree.ElementTree as ET
|
9 |
import filetype
|
10 |
-
import requests
|
11 |
import os
|
12 |
import mimetypes
|
13 |
from bs4 import BeautifulSoup
|
14 |
-
|
15 |
|
16 |
# Constants
|
17 |
CHUNK_SIZE = 32000
|
18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
# --- Utility Functions ---
|
20 |
|
21 |
def xml2text(xml):
|
@@ -213,7 +249,10 @@ def download_and_process_file(url, clean=True):
|
|
213 |
url = "http://" + url # Prepend "http://" if not present
|
214 |
|
215 |
try:
|
216 |
-
response =
|
|
|
|
|
|
|
217 |
original_filename = os.path.basename(url)
|
218 |
safe_filename = re.sub(r'[^\w\-_\. ]', '_', original_filename)
|
219 |
temp_filename = f"{safe_filename}"
|
@@ -224,8 +263,7 @@ def download_and_process_file(url, clean=True):
|
|
224 |
temp_filename += ext
|
225 |
|
226 |
with open(temp_filename, 'wb') as f:
|
227 |
-
|
228 |
-
f.write(chunk)
|
229 |
|
230 |
kind = filetype.guess(temp_filename)
|
231 |
if kind and kind.mime.startswith('image/'):
|
@@ -233,13 +271,9 @@ def download_and_process_file(url, clean=True):
|
|
233 |
else:
|
234 |
return read_document(temp_filename, clean, url) # Otherwise, process as a document
|
235 |
|
236 |
-
except
|
237 |
-
return "Error:
|
238 |
-
except
|
239 |
-
return "Error: Could not connect to the server. Please check your internet connection.", 0
|
240 |
-
except requests.exceptions.Timeout:
|
241 |
-
return "Error: Connection timed out while trying to fetch the URL.", 0
|
242 |
-
except requests.exceptions.RequestException as e:
|
243 |
return f"Error downloading file: {e}", 0
|
244 |
|
245 |
# --- Gradio Interface ---
|
|
|
7 |
import zipfile
|
8 |
import xml.etree.ElementTree as ET
|
9 |
import filetype
|
|
|
10 |
import os
|
11 |
import mimetypes
|
12 |
from bs4 import BeautifulSoup
|
13 |
+
import urllib3
|
14 |
|
15 |
# Constants
|
16 |
CHUNK_SIZE = 32000
|
17 |
|
18 |
+
# --- Custom HTTP Session and Response Classes ---
|
19 |
+
|
20 |
+
class CustomSession:
    """Minimal HTTP session backed by a ``urllib3.PoolManager``.

    Provides a ``get`` method that performs the request and wraps the raw
    urllib3 response in a ``CustomResponse``.
    """

    def __init__(self):
        # One pool manager per session; all requests made through this
        # session go through it.
        self.pool_manager = urllib3.PoolManager()

    def get(self, url, timeout=30.0):
        """Issue a GET request for *url* and return a ``CustomResponse``.

        Args:
            url: Absolute URL to fetch.
            timeout: Timeout in seconds handed to urllib3 for this request.
                Without an explicit value urllib3 waits indefinitely, so a
                stalled server would hang the caller; the default bounds it.

        Returns:
            A ``CustomResponse`` wrapping the urllib3 response.
        """
        response = self.pool_manager.request('GET', url, timeout=timeout)
        return CustomResponse(response)
|
27 |
+
|
28 |
+
class CustomResponse:
    """Small wrapper around a urllib3 response object.

    Copies ``status``, ``headers`` and ``data`` from the wrapped response
    into ``status_code``, ``headers`` and ``content``, and offers helpers to
    view the body as JSON, text, a BeautifulSoup tree, or whitespace-cleaned
    text.
    """

    def __init__(self, response):
        self.status_code = response.status
        self.headers = response.headers
        self.content = response.data  # raw body as returned by urllib3

    def json(self):
        """Parse the body as JSON and return the decoded object."""
        import json
        return json.loads(self.content)

    def text(self):
        """Return the body decoded as UTF-8.

        Note this is a method, not a property: call ``response.text()``.
        """
        return self.content.decode('utf-8')

    def soup(self):
        """Return the body parsed by BeautifulSoup with the ``lxml`` parser."""
        return BeautifulSoup(self.content, 'lxml')

    def clean_text(self):
        """Return the document's visible text with spacing normalized.

        Newlines and carriage returns become spaces, every run of spaces is
        collapsed to a single space, and surrounding whitespace is stripped.
        """
        flattened = self.soup().get_text().replace('\n', ' ').replace('\r', ' ')
        # Splitting on a literal space and dropping the empty pieces collapses
        # runs of spaces in one pass. The previous ``while ' ' in text`` loop
        # replaced a space with itself and never terminated once the text
        # contained any space.
        pieces = [piece for piece in flattened.split(' ') if piece]
        return ' '.join(pieces).strip()
|
50 |
+
|
51 |
+
def get(url):
    """Fetch *url* with a fresh ``CustomSession`` and return its response."""
    return CustomSession().get(url)
|
54 |
+
|
55 |
# --- Utility Functions ---
|
56 |
|
57 |
def xml2text(xml):
|
|
|
249 |
url = "http://" + url # Prepend "http://" if not present
|
250 |
|
251 |
try:
|
252 |
+
response = get(url)
|
253 |
+
if response.status_code != 200:
|
254 |
+
return f"Error: Received status code {response.status_code} from the server.", 0
|
255 |
+
|
256 |
original_filename = os.path.basename(url)
|
257 |
safe_filename = re.sub(r'[^\w\-_\. ]', '_', original_filename)
|
258 |
temp_filename = f"{safe_filename}"
|
|
|
263 |
temp_filename += ext
|
264 |
|
265 |
with open(temp_filename, 'wb') as f:
|
266 |
+
f.write(response.content)
|
|
|
267 |
|
268 |
kind = filetype.guess(temp_filename)
|
269 |
if kind and kind.mime.startswith('image/'):
|
|
|
271 |
else:
|
272 |
return read_document(temp_filename, clean, url) # Otherwise, process as a document
|
273 |
|
274 |
+
except urllib3.exceptions.HTTPError as e:
|
275 |
+
return f"Error: {e}", 0
|
276 |
+
except Exception as e:
|
|
|
|
|
|
|
|
|
277 |
return f"Error downloading file: {e}", 0
|
278 |
|
279 |
# --- Gradio Interface ---
|