Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -91,28 +91,8 @@ FILE_EMOJIS = {
|
|
91 |
"mp3": "🎵",
|
92 |
}
|
93 |
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
Scans text like a caffeinated librarian on a mission, hunting for words that actually
|
98 |
-
mean something. Filters out boring words like 'the' and 'and' (sorry old friends),
|
99 |
-
while preserving the good stuff like 'quantum' and 'neural' (party time! 🎉).
|
100 |
-
|
101 |
-
Think of it as a bouncer for your filenames - if a word isn't cool enough,
|
102 |
-
it's not getting in. But key phrases? VIP access, baby! π
|
103 |
-
|
104 |
-
Args:
|
105 |
-
text (str): The text to strip mine for linguistic gold
|
106 |
-
prioritize_start (bool): If True, treats the start like the cool kids' table
|
107 |
-
(default behavior because we're not monsters)
|
108 |
-
|
109 |
-
Returns:
|
110 |
-
list: The VIP list of words that made the cut. Maximum of 8 terms if we're
|
111 |
-
prioritizing the start (because YOLO), 5 otherwise (because sanity).
|
112 |
-
|
113 |
-
Warning: May occasionally let through a word that sounds smart but is actually
|
114 |
-
just showing off. We're working on its ego. π
|
115 |
-
"""
|
116 |
stop_words = set([
|
117 |
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
|
118 |
'by', 'from', 'up', 'about', 'into', 'over', 'after', 'is', 'are', 'was', 'were',
|
@@ -120,9 +100,7 @@ def get_high_info_terms(text: str, prioritize_start=True) -> list:
|
|
120 |
'should', 'could', 'might', 'must', 'shall', 'can', 'may', 'this', 'that', 'these',
|
121 |
'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'what', 'which', 'who',
|
122 |
'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most',
|
123 |
-
'other', 'some', 'such', 'than', 'too', 'very', 'just', 'there'
|
124 |
-
'explain', 'show', 'give', 'write', 'provide', 'need', 'want', 'would', 'could',
|
125 |
-
'lets', 'let', 'try', 'use', 'make', 'help'
|
126 |
])
|
127 |
|
128 |
key_phrases = [
|
@@ -135,48 +113,23 @@ def get_high_info_terms(text: str, prioritize_start=True) -> list:
|
|
135 |
'research paper', 'scientific study', 'empirical analysis'
|
136 |
]
|
137 |
|
138 |
-
# First check for key phrases at the start of the text
|
139 |
-
start_phrases = []
|
140 |
-
lower_text = text.lower()
|
141 |
-
text_start = lower_text[:100] # Look at first 100 chars for starting phrases
|
142 |
-
for phrase in key_phrases:
|
143 |
-
if text_start.startswith(phrase) or text_start.find(f" {phrase}") >= 0:
|
144 |
-
start_phrases.append(phrase)
|
145 |
-
text = text.replace(phrase, '')
|
146 |
-
|
147 |
-
# Then check for key phrases in the rest of the text
|
148 |
preserved_phrases = []
|
|
|
149 |
for phrase in key_phrases:
|
150 |
-
if phrase in lower_text
|
151 |
preserved_phrases.append(phrase)
|
152 |
text = text.replace(phrase, '')
|
153 |
|
154 |
-
|
155 |
-
start_words = text.split()[:50]
|
156 |
-
words_with_pos = []
|
157 |
-
for pos, word in enumerate(start_words):
|
158 |
-
word = re.sub(r'[^\w\s-]', '', word.lower())
|
159 |
-
if (len(word) > 3 and
|
160 |
-
word not in stop_words and
|
161 |
-
not word.isdigit() and
|
162 |
-
any(c.isalpha() for c in word)):
|
163 |
-
words_with_pos.append((pos, word))
|
164 |
-
|
165 |
-
# Get remaining high-info words from the rest of the text
|
166 |
-
remaining_words = re.findall(r'\b\w+(?:-\w+)*\b', text[100:])
|
167 |
high_info_words = [
|
168 |
-
word.lower() for word in
|
169 |
if len(word) > 3
|
170 |
and word.lower() not in stop_words
|
171 |
and not word.isdigit()
|
172 |
and any(c.isalpha() for c in word)
|
173 |
]
|
174 |
|
175 |
-
|
176 |
-
start_terms = [word for _, word in sorted(words_with_pos, key=lambda x: x[0])][:3]
|
177 |
-
all_terms = (start_phrases + start_terms + preserved_phrases + high_info_words)
|
178 |
-
|
179 |
-
# Remove duplicates while preserving order
|
180 |
seen = set()
|
181 |
unique_terms = []
|
182 |
for term in all_terms:
|
@@ -184,50 +137,22 @@ def get_high_info_terms(text: str, prioritize_start=True) -> list:
|
|
184 |
seen.add(term)
|
185 |
unique_terms.append(term)
|
186 |
|
187 |
-
max_terms =
|
188 |
return unique_terms[:max_terms]
|
189 |
|
|
|
190 |
def generate_filename(content, file_type="md"):
|
191 |
-
"""π― #2 - The File Naming Sommelier (pairs well with frustrated developers)
|
192 |
-
|
193 |
-
Takes your content and turns it into a filename that's actually readable by humans!
|
194 |
-
A revolutionary concept, we know. Combines timestamps with meaningful words,
|
195 |
-
because '20231218_quantum_research' beats 'asdfg123.md' any day of the week.
|
196 |
-
|
197 |
-
Think of it as your personal file naming barista - takes your raw content beans
|
198 |
-
and turns them into a smooth, well-crafted filename. No foam art though, sorry! ☕
|
199 |
-
|
200 |
-
Args:
|
201 |
-
content (str): Your beautiful text that needs a home(name)
|
202 |
-
file_type (str): The file extension (defaults to "md" because we're markdown
|
203 |
-
hipsters at heart)
|
204 |
-
|
205 |
-
Returns:
|
206 |
-
str: A filename that won't make you question your life choices when you see it
|
207 |
-
in 6 months. Limited to 120 chars because we're not writing a novel here.
|
208 |
-
|
209 |
-
Pro Tip: If your filename ends up being just 'file.md', either your content was
|
210 |
-
empty or we've failed spectacularly. Please file a bug report or just
|
211 |
-
laugh it off. πͺ
|
212 |
-
"""
|
213 |
prefix = datetime.now().strftime("%y%m_%H%M") + "_"
|
214 |
-
|
215 |
-
# Get high-info terms with start prioritization
|
216 |
-
info_terms = get_high_info_terms(content, prioritize_start=True)
|
217 |
-
|
218 |
-
# Create filename with terms
|
219 |
name_text = '_'.join(term.replace(' ', '-') for term in info_terms) if info_terms else 'file'
|
220 |
|
221 |
-
|
222 |
-
max_length = 120 # Increased to allow more meaningful content
|
223 |
if len(name_text) > max_length:
|
224 |
name_text = name_text[:max_length]
|
225 |
|
226 |
filename = f"{prefix}{name_text}.{file_type}"
|
227 |
return filename
|
228 |
|
229 |
-
|
230 |
-
|
231 |
# 7. Audio Processing
|
232 |
def clean_for_speech(text: str) -> str:
|
233 |
text = text.replace("\n", " ")
|
|
|
91 |
"mp3": "🎵",
|
92 |
}
|
93 |
|
94 |
+
# 5. High-Information Content Extraction
|
95 |
+
def get_high_info_terms(text: str) -> list:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
stop_words = set([
|
97 |
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
|
98 |
'by', 'from', 'up', 'about', 'into', 'over', 'after', 'is', 'are', 'was', 'were',
|
|
|
100 |
'should', 'could', 'might', 'must', 'shall', 'can', 'may', 'this', 'that', 'these',
|
101 |
'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'what', 'which', 'who',
|
102 |
'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most',
|
103 |
+
'other', 'some', 'such', 'than', 'too', 'very', 'just', 'there'
|
|
|
|
|
104 |
])
|
105 |
|
106 |
key_phrases = [
|
|
|
113 |
'research paper', 'scientific study', 'empirical analysis'
|
114 |
]
|
115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
preserved_phrases = []
|
117 |
+
lower_text = text.lower()
|
118 |
for phrase in key_phrases:
|
119 |
+
if phrase in lower_text:
|
120 |
preserved_phrases.append(phrase)
|
121 |
text = text.replace(phrase, '')
|
122 |
|
123 |
+
words = re.findall(r'\b\w+(?:-\w+)*\b', text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
high_info_words = [
|
125 |
+
word.lower() for word in words
|
126 |
if len(word) > 3
|
127 |
and word.lower() not in stop_words
|
128 |
and not word.isdigit()
|
129 |
and any(c.isalpha() for c in word)
|
130 |
]
|
131 |
|
132 |
+
all_terms = preserved_phrases + high_info_words
|
|
|
|
|
|
|
|
|
133 |
seen = set()
|
134 |
unique_terms = []
|
135 |
for term in all_terms:
|
|
|
137 |
seen.add(term)
|
138 |
unique_terms.append(term)
|
139 |
|
140 |
+
max_terms = 5
|
141 |
return unique_terms[:max_terms]
|
142 |
|
143 |
+
# 6. Filename Generation
|
144 |
def generate_filename(content, file_type="md"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
prefix = datetime.now().strftime("%y%m_%H%M") + "_"
|
146 |
+
info_terms = get_high_info_terms(content)
|
|
|
|
|
|
|
|
|
147 |
name_text = '_'.join(term.replace(' ', '-') for term in info_terms) if info_terms else 'file'
|
148 |
|
149 |
+
max_length = 100
|
|
|
150 |
if len(name_text) > max_length:
|
151 |
name_text = name_text[:max_length]
|
152 |
|
153 |
filename = f"{prefix}{name_text}.{file_type}"
|
154 |
return filename
|
155 |
|
|
|
|
|
156 |
# 7. Audio Processing
|
157 |
def clean_for_speech(text: str) -> str:
|
158 |
text = text.replace("\n", " ")
|