import os
import re
import unicodedata

import contractions
import gradio as gr
import nltk

# Resources needed by nltk.word_tokenize and the English stopword list.
nltk.download('punkt')
nltk.download('stopwords')

# Install the spaCy English model at startup (useful on hosts such as
# Hugging Face Spaces, where the model is not a pip requirement).
os.system('python -m spacy download en_core_web_sm')

import spacy
import en_core_web_sm

nlp = en_core_web_sm.load()
# nlp = spacy.load('en_core_web_sm')  # equivalent alternative


def spacy_lemmatize_text(text):
    # Keep pronouns as-is: spaCy 2.x returns '-PRON-' as their lemma.
    doc = nlp(text)
    return ' '.join(word.lemma_ if word.lemma_ != '-PRON-' else word.text
                    for word in doc)


def remove_accented_chars(text):
    # NFKD decomposes accented characters so the ASCII encode can drop the
    # combining marks ('é' -> 'e'); NFC would discard the whole character.
    return (unicodedata.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore'))


def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)


def remove_stopwords(text, is_lower_case=False, stopwords=None):
    if not stopwords:
        stopwords = nltk.corpus.stopwords.words('english')
    tokens = [token.strip() for token in nltk.word_tokenize(text)]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords]
    else:
        filtered_tokens = [token for token in tokens
                           if token.lower() not in stopwords]
    return ' '.join(filtered_tokens)


def greet(sentence):
    # Preprocessing pipeline: strip accents, expand contractions, remove
    # punctuation and digits, lowercase, lemmatize, drop stopwords.
    opo_texto_sem_caracteres_especiais = remove_accented_chars(sentence)
    sentenceExpanded = contractions.fix(opo_texto_sem_caracteres_especiais)
    sentenceWithoutPunctuation = remove_special_characters(sentenceExpanded, remove_digits=True)
    sentenceLowered = sentenceWithoutPunctuation.lower()
    sentenceLemmatized = spacy_lemmatize_text(sentenceLowered)
    sentenceLemStopped = remove_stopwords(sentenceLemmatized, is_lower_case=False)
    # Gradio renders the returned token list as plain text.
    return nltk.word_tokenize(sentenceLemStopped)


iface = gr.Interface(fn=greet, inputs="text", outputs="text")
iface.launch()
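
# A rough sanity check of the pipeline (hypothetical example; the exact
# tokens depend on the contractions, NLTK, and spaCy model versions):
#
#     greet("I can't believe it's working!")
#     # -> ['believe', 'work']
#
# "can't"/"it's" are expanded, punctuation is stripped, "working" is
# lemmatized to "work", and everything else is an NLTK English stopword.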