KingNish committed on
Commit
116c368
·
verified ·
1 Parent(s): fab1175

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -12
app.py CHANGED
@@ -7,15 +7,51 @@ import re
7
  import zipfile
8
  import xml.etree.ElementTree as ET
9
  import filetype
10
- import requests
11
  import os
12
  import mimetypes
13
  from bs4 import BeautifulSoup
14
- from urllib.parse import urljoin
15
 
16
  # Constants
17
  CHUNK_SIZE = 32000
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  # --- Utility Functions ---
20
 
21
  def xml2text(xml):
@@ -213,7 +249,10 @@ def download_and_process_file(url, clean=True):
213
  url = "http://" + url # Prepend "http://" if not present
214
 
215
  try:
216
- response = requests.get(url, stream=True, timeout=10)
 
 
 
217
  original_filename = os.path.basename(url)
218
  safe_filename = re.sub(r'[^\w\-_\. ]', '_', original_filename)
219
  temp_filename = f"{safe_filename}"
@@ -224,8 +263,7 @@ def download_and_process_file(url, clean=True):
224
  temp_filename += ext
225
 
226
  with open(temp_filename, 'wb') as f:
227
- for chunk in response.iter_content(chunk_size=8192000):
228
- f.write(chunk)
229
 
230
  kind = filetype.guess(temp_filename)
231
  if kind and kind.mime.startswith('image/'):
@@ -233,13 +271,9 @@ def download_and_process_file(url, clean=True):
233
  else:
234
  return read_document(temp_filename, clean, url) # Otherwise, process as a document
235
 
236
- except requests.exceptions.MissingSchema:
237
- return "Error: Invalid URL format. Even after adding 'http://', the URL is still invalid.", 0
238
- except requests.exceptions.ConnectionError:
239
- return "Error: Could not connect to the server. Please check your internet connection.", 0
240
- except requests.exceptions.Timeout:
241
- return "Error: Connection timed out while trying to fetch the URL.", 0
242
- except requests.exceptions.RequestException as e:
243
  return f"Error downloading file: {e}", 0
244
 
245
  # --- Gradio Interface ---
 
7
  import zipfile
8
  import xml.etree.ElementTree as ET
9
  import filetype
 
10
  import os
11
  import mimetypes
12
  from bs4 import BeautifulSoup
13
+ import urllib3
14
 
15
  # Constants
16
  CHUNK_SIZE = 32000
17
 
18
+ # --- Custom HTTP Session and Response Classes ---
19
+
20
class CustomSession:
    """Minimal HTTP session backed by a ``urllib3.PoolManager``.

    Only ``GET`` is supported; each response is wrapped in a
    ``CustomResponse``.
    """

    # Seconds to wait on connect/read before giving up. Matches the
    # 10-second limit the earlier requests-based download used, so a
    # stalled server cannot block the caller forever.
    DEFAULT_TIMEOUT = 10.0

    def __init__(self):
        self.pool_manager = urllib3.PoolManager()

    def get(self, url, timeout=DEFAULT_TIMEOUT):
        """Issue a GET request for *url* and wrap the result.

        Args:
            url: Absolute URL to fetch.
            timeout: Connect/read timeout in seconds (or a
                ``urllib3.Timeout``). Pass ``None`` to wait indefinitely.

        Returns:
            A ``CustomResponse`` holding the status, headers, and body.

        Raises:
            urllib3.exceptions.HTTPError: On connection failure or timeout
                (urllib3 raises subclasses of this).
        """
        response = self.pool_manager.request('GET', url, timeout=timeout)
        return CustomResponse(response)
27
+
28
class CustomResponse:
    """Thin wrapper around a urllib3 response with a few parsing helpers.

    Attributes are copied from the wrapped response at construction time:
    ``status_code`` from ``response.status``, ``headers`` from
    ``response.headers``, and ``content`` (raw body bytes) from
    ``response.data``.
    """

    def __init__(self, response):
        self.status_code = response.status
        self.headers = response.headers
        self.content = response.data

    def json(self):
        """Parse the body as JSON and return the decoded object."""
        import json
        return json.loads(self.content)

    def text(self):
        """Return the body decoded as UTF-8.

        Raises:
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        return self.content.decode('utf-8')

    def soup(self):
        """Parse the body with BeautifulSoup using the ``lxml`` parser."""
        return BeautifulSoup(self.content, 'lxml')

    @staticmethod
    def _collapse_whitespace(text):
        """Turn newlines/carriage returns into spaces and squeeze space runs.

        Leading and trailing whitespace is stripped from the result.
        """
        flattened = text.replace('\n', ' ').replace('\r', ' ')
        # Collapse every run of spaces in a single pass. A replace-in-a-loop
        # whose search and replacement strings are equal would never
        # terminate as soon as the text contained a space.
        return re.sub(r' {2,}', ' ', flattened).strip()

    def clean_text(self):
        """Return the visible text of the HTML body as one normalized line."""
        return self._collapse_whitespace(self.soup().get_text())
50
+
51
def get(url):
    """Fetch *url* through a freshly created ``CustomSession``.

    Returns whatever ``CustomSession.get`` returns for that URL.
    """
    return CustomSession().get(url)
54
+
55
  # --- Utility Functions ---
56
 
57
  def xml2text(xml):
 
249
  url = "http://" + url # Prepend "http://" if not present
250
 
251
  try:
252
+ response = get(url)
253
+ if response.status_code != 200:
254
+ return f"Error: Received status code {response.status_code} from the server.", 0
255
+
256
  original_filename = os.path.basename(url)
257
  safe_filename = re.sub(r'[^\w\-_\. ]', '_', original_filename)
258
  temp_filename = f"{safe_filename}"
 
263
  temp_filename += ext
264
 
265
  with open(temp_filename, 'wb') as f:
266
+ f.write(response.content)
 
267
 
268
  kind = filetype.guess(temp_filename)
269
  if kind and kind.mime.startswith('image/'):
 
271
  else:
272
  return read_document(temp_filename, clean, url) # Otherwise, process as a document
273
 
274
+ except urllib3.exceptions.HTTPError as e:
275
+ return f"Error: {e}", 0
276
+ except Exception as e:
 
 
 
 
277
  return f"Error downloading file: {e}", 0
278
 
279
  # --- Gradio Interface ---