suvadityamuk committed on
Commit
d6cf17e
·
1 Parent(s): 3dd4599

Signed-off-by: Suvaditya Mukherjee <suvadityamuk@gmail.com>

Files changed (1) hide show
  1. utils.py +85 -0
utils.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gdown
2
+ import os
3
+ from urllib.parse import urlparse, parse_qs
4
+
5
def download_pdf_from_gdrive(url, output_path=None):
    """
    Download a PDF file from Google Drive using the provided sharing URL.

    Errors are handled best-effort: any failure (empty or non-Drive URL,
    file ID that cannot be extracted, failed download) is printed and
    signalled by returning None instead of propagating an exception.

    Parameters:
    url (str): The Google Drive sharing URL of the PDF file. Both the
        ``/file/d/<id>`` path form and the ``?id=<id>`` query form are
        supported.
    output_path (str, optional): The path where the PDF should be saved.
        If not provided, saves as 'downloaded_file.pdf' in the current
        directory. A '.pdf' suffix is appended when missing.

    Returns:
    str: Path to the downloaded file if successful, None if failed
    """
    try:
        # Check if URL is provided
        if not url:
            raise ValueError("URL cannot be empty")

        # Only Google Drive links are handled
        if 'drive.google.com' not in url:
            raise ValueError("Not a valid Google Drive URL")

        # Extract file ID from the URL
        file_id = None
        if '/file/d/' in url:
            file_id = url.split('/file/d/', 1)[1]
            # The ID ends at the next path segment, query string or fragment,
            # e.g. '/file/d/<id>?usp=sharing' has no trailing slash.
            for separator in '/?#':
                file_id = file_id.split(separator, 1)[0]
        else:
            # Look up the real 'id' query parameter instead of searching for
            # the substring 'id=', which also matches e.g. 'uid=' and then
            # fails with an unhelpful KeyError.
            id_values = parse_qs(urlparse(url).query).get('id')
            if id_values:
                file_id = id_values[0]

        if not file_id:
            raise ValueError("Could not extract file ID from the URL")

        # Set default output path if none provided
        if not output_path:
            output_path = 'downloaded_file.pdf'

        # Ensure the output path ends with .pdf
        if not output_path.lower().endswith('.pdf'):
            output_path += '.pdf'

        # Create the directory if it doesn't exist ('.' for bare file names)
        os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)

        # Download the file
        output = gdown.download(id=file_id, output=output_path, quiet=False)

        if output is None:
            raise ValueError("Download failed - file might be inaccessible or not exist")

        return output

    except Exception as e:
        # Deliberate best-effort boundary: report the problem and return None.
        print(f"Error downloading PDF: {str(e)}")
        return None
59
+
60
def merge_strings_with_prefix(strings, prefix="•"):
    """Merges continuation strings into the preceding prefixed string.

    Every string that starts with `prefix` begins a new merged entry; each
    following string that does not start with `prefix` is concatenated onto
    that entry (no separator is inserted). Strings that appear before the
    first prefixed string are concatenated into one leading entry. Entries
    that end up empty are dropped.

    Args:
        strings: A list of strings.
        prefix: The marker that starts a new entry. Defaults to the bullet
            character "•".

    Returns:
        A new list of merged strings.
    """
    result = []
    current_parts = []

    def flush():
        # Join once per entry instead of growing a string with repeated `+=`.
        merged = "".join(current_parts)
        if merged:
            result.append(merged)

    for string in strings:
        if string.startswith(prefix):
            flush()
            current_parts = [string]
        else:
            current_parts.append(string)

    flush()
    return result
85
+