Spaces:
Runtime error
Runtime error
Update normalizer.py
Browse files- normalizer.py +69 -13
normalizer.py
CHANGED
@@ -4,6 +4,12 @@ import pandas as pd
|
|
4 |
from aiogoogletrans import Translator
|
5 |
from spellchecker import SpellChecker
|
6 |
from nltk.tokenize import RegexpTokenizer
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
|
9 |
class Normalizer:
|
@@ -353,7 +359,7 @@ class Normalizer:
|
|
353 |
print(f"An error occurred during character repetition removal: {e}")
|
354 |
return word
|
355 |
|
356 |
-
def translate_text(self, text: str) -> str:
|
357 |
"""
|
358 |
Translate the given text to English and return the translated text.
|
359 |
|
@@ -364,19 +370,14 @@ class Normalizer:
|
|
364 |
- str: The translated text.
|
365 |
"""
|
366 |
try:
|
367 |
-
|
368 |
-
translated_text = (
|
369 |
-
loop.run_until_complete(self._translator.translate(text))
|
370 |
-
.text.lower()
|
371 |
-
.strip()
|
372 |
-
)
|
373 |
except Exception as e:
|
374 |
print(f"Text Translation failed: {e}")
|
375 |
-
translated_text = (
|
376 |
-
text.lower().strip()
|
377 |
-
) # Use original text if translation fails
|
378 |
return translated_text
|
379 |
|
|
|
380 |
def check_spelling(self, query: str) -> str:
|
381 |
"""
|
382 |
Check the spelling of the input query and return the corrected version.
|
@@ -388,9 +389,8 @@ class Normalizer:
|
|
388 |
- str: The corrected query.
|
389 |
"""
|
390 |
try:
|
391 |
-
# Detect the language of the input query
|
392 |
-
|
393 |
-
input_language = "en" if query.encode().isalpha() else "ar"
|
394 |
|
395 |
# Initialize SpellChecker with detected language, fallback to English if language detection fails
|
396 |
try:
|
@@ -423,6 +423,62 @@ class Normalizer:
|
|
423 |
except Exception as e:
|
424 |
print(f"An error occurred during spelling check: {e}")
|
425 |
return query
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
426 |
|
427 |
def clean_text(self, text):
|
428 |
"""
|
|
|
4 |
from aiogoogletrans import Translator
|
5 |
from spellchecker import SpellChecker
|
6 |
from nltk.tokenize import RegexpTokenizer
|
7 |
+
from langdetect import detect
|
8 |
+
from fake_useragent import UserAgent
|
9 |
+
import json
|
10 |
+
import requests
|
11 |
+
from fastapi import HTTPException
|
12 |
+
from main import logger
|
13 |
|
14 |
|
15 |
class Normalizer:
|
|
|
359 |
print(f"An error occurred during character repetition removal: {e}")
|
360 |
return word
|
361 |
|
362 |
+
async def translate_text(self, text: str) -> str:
|
363 |
"""
|
364 |
Translate the given text to English and return the translated text.
|
365 |
|
|
|
370 |
- str: The translated text.
|
371 |
"""
|
372 |
try:
|
373 |
+
translated_text = await self._translator.translate(text, dest="en")
|
374 |
+
translated_text = translated_text.text.lower().strip()
|
|
|
|
|
|
|
|
|
375 |
except Exception as e:
|
376 |
print(f"Text Translation failed: {e}")
|
377 |
+
translated_text = text.lower().strip() # Use original text if translation fails
|
|
|
|
|
378 |
return translated_text
|
379 |
|
380 |
+
|
381 |
def check_spelling(self, query: str) -> str:
|
382 |
"""
|
383 |
Check the spelling of the input query and return the corrected version.
|
|
|
389 |
- str: The corrected query.
|
390 |
"""
|
391 |
try:
|
392 |
+
# Detect the language of the input query
|
393 |
+
input_language = detect(query)
|
|
|
394 |
|
395 |
# Initialize SpellChecker with detected language, fallback to English if language detection fails
|
396 |
try:
|
|
|
423 |
except Exception as e:
|
424 |
print(f"An error occurred during spelling check: {e}")
|
425 |
return query
|
426 |
+
|
427 |
+
def query_suggestions(self, query: str) -> str:
    """
    Get the top suggestion for a query string from Google's Suggest API.

    Parameters:
    - query (str): The query string for which suggestions are to be retrieved.

    Returns:
    - str: The first suggested query, or the original query when it is blank
      or when no suggestions are returned.

    Raises:
    - HTTPException: Carries the upstream status code when the Suggest API
      answers with a non-200 status, and status 500 for any other failure
      (network error, timeout, unparsable response).
    """
    # Nothing to complete for blank input, so skip detection and the
    # network round-trip entirely.
    if not query or not query.strip():
        return query

    # Imported locally: only this method needs the exception type.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        # Only Arabic gets special treatment. Every other result, including
        # a failed detection (e.g. input with no usable letters), maps to
        # English.
        try:
            detected = detect(query)
        except LangDetectException:
            detected = "en"
        lang = "ar" if detected == "ar" else "en"

        # Let requests build the query string so characters such as
        # '&', '#', '+' and '%' are percent-encoded instead of corrupting
        # the URL (spaces are still sent as '+').
        # NOTE(review): `gl` is normally a country code and `hl` a language
        # code; the values here look swapped -- confirm before changing.
        params = {"output": "firefox", "gl": lang, "hl": "sa", "q": query}

        # Send a browser-like user-agent with the request.
        headers = {"user-agent": UserAgent().chrome}

        # NOTE(review): the endpoint is plain http, so `verify=True` has no
        # effect; consider switching the scheme to https.
        # The timeout keeps a stalled connection from blocking the caller
        # forever.
        response = requests.get(
            "http://suggestqueries.google.com/complete/search",
            params=params,
            headers=headers,
            verify=True,
            timeout=10,
        )

        if response.status_code != 200:
            raise HTTPException(
                status_code=response.status_code,
                detail=f"An error occurred during the request to Google Suggest API: {response.text}",
            )

        # The code expects a payload shaped like [query, [suggestion, ...]].
        suggestions = json.loads(response.text)

        # Return the first suggestion when one is available.
        if len(suggestions) > 1 and suggestions[1]:
            return suggestions[1][0]

        # No suggestions: fall back to the caller's original query.
        return query

    except HTTPException:
        # Preserve the upstream status code instead of re-wrapping it as 500.
        raise
    except Exception as e:
        # Any other failure is reported as an internal error, with the
        # original exception chained for debugging.
        raise HTTPException(
            status_code=500,
            detail=f"An error occurred during the request to Google Suggest API: {e}",
        ) from e
|
482 |
|
483 |
def clean_text(self, text):
|
484 |
"""
|