Text_Processor / app.py
roygryan's picture
Update app.py
c43e5a6
# Import sentencepiece before transformers to avoid crashes (keep this order).
import sentencepiece
# --- Text generation ---
from transformers import pipeline
# distilgpt2 text-generation pipeline; output length and the number of returned
# sequences are supplied at call time (max_length / num_return_sequences).
generator = pipeline("text-generation", model="distilgpt2")
# --- Named-entity recognition ---
# No model is named here, so the pipeline's default NER model is used.
# Usage: ner('text'). Not currently called by TextProcessor.
ner = pipeline("ner", grouped_entities=True)
# --- Summarization ---
# No model is named here, so the pipeline's default summarization model is used.
# Usage: summarizer('text'). Not currently called by TextProcessor.
summarizer = pipeline("summarization")
# --- POS tagging ---
import nltk
# NLTK resources fetched at import time; presumably required by TextBlob's
# tokenizer/tagger -- confirm before removing any of them.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('brown')
# Usage: blob = TextBlob(text); POS_List = blob.tags
from textblob import TextBlob
# --- Chinese -> English translation ---
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Tokenizer and seq2seq model for Helsinki-NLP/opus-mt-zh-en; the slow
# (non-fast) tokenizer implementation is requested explicitly.
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en", use_fast = False)
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
# Traditional -> simplified Chinese conversion.
# Usage: sim_text = jio.tra2sim(tra_text, mode='char')
import jionlp as jio
# Web UI
import gradio as gr
# Markers that request text continuation: three ASCII dots or the single
# Unicode ellipsis character.
_ELLIPSIS_MARKERS = ("...", "…")


def _contains_cjk(text):
    """Return True if *text* contains at least one CJK ideograph."""
    return any(
        "\u4e00" <= ch <= "\u9fff"      # CJK Unified Ideographs
        or "\u3400" <= ch <= "\u4dbf"   # CJK Unified Ideographs Extension A
        or "\uf900" <= ch <= "\ufaff"   # CJK Compatibility Ideographs
        for ch in text
    )


def _strip_trailing_ellipsis(text):
    """Return *text* without trailing whitespace and one trailing ellipsis marker."""
    trimmed = text.rstrip()
    for marker in _ELLIPSIS_MARKERS:
        if trimmed.endswith(marker):
            # Remove exactly the marker that is present ("..." is three
            # characters, "…" is one) instead of a fixed three characters.
            return trimmed[:-len(marker)]
    return trimmed


def TextProcessor(txt):
    """Route *txt* to translation, POS tagging, generation, or noun/verb listing.

    Routing, in order:

    1. Empty or whitespace-only input (or ``None``) returns ``""``.
    2. Text containing a CJK ideograph is converted to simplified Chinese
       with ``jio.tra2sim`` and translated to English; the translated string
       is returned.
    3. A single whitespace-delimited token returns its POS tag from
       ``TextBlob(...).tags``, or ``""`` if the tagger yields no tags.
    4. Text containing ``"..."`` or ``"…"`` is used as a prompt for the
       ``generator`` pipeline (a trailing marker is removed first); the
       generated string is returned.
    5. Anything else returns the tuple
       ``("noun_phrases:", nouns, "verb_phrases:", verbs)`` where ``nouns``
       and ``verbs`` are the ``(word, tag)`` pairs whose tag starts with
       ``"N"`` / ``"V"``.

    Args:
        txt: Input text; non-string values are converted with ``str``.

    Returns:
        A string for cases 1-4, or a 4-tuple for case 5.
    """
    txt = "" if txt is None else str(txt)
    if not txt.strip():
        # Nothing to process; the original indexed txt[0] and crashed here.
        return ""

    if _contains_cjk(txt):
        # Normalize traditional characters to simplified before translating.
        sim_text = jio.tra2sim(txt, mode='char')
        # Built per call from the module-level model and tokenizer.
        zh2en_trans = pipeline("translation_zh_to_en", model=model, tokenizer=tokenizer)
        return zh2en_trans(sim_text)[0]['translation_text']

    # Fewer than two tokens: treat the input as a single word.
    if len(txt.split()) < 2:
        tags = TextBlob(txt).tags
        # The tagger can return no tags (e.g. punctuation-only input).
        return tags[0][1] if tags else ""

    # Test each marker explicitly. The original `"..." or "…" in txt` was
    # always true because the non-empty literal "..." is truthy.
    if any(marker in txt for marker in _ELLIPSIS_MARKERS):
        prompt = _strip_trailing_ellipsis(txt)
        generated = generator(prompt, max_length=50, num_return_sequences=1)
        return generated[0]["generated_text"]

    # NOTE: the module-level summarizer and ner pipelines are not used here.
    tags = TextBlob(txt).tags
    nouns = [pair for pair in tags if pair[1].startswith("N")]
    verbs = [pair for pair in tags if pair[1].startswith("V")]
    return ("noun_phrases:", nouns, "verb_phrases:", verbs)
# Expose TextProcessor through a Gradio UI with one text input and one text
# output, then start serving it.
final = gr.Interface(
    fn=TextProcessor,
    inputs="text",
    outputs="text",
)
final.launch()