# Hugging Face Space (status: Sleeping) — multi-task NLP text-processing demo
# import sentencepiece before transformers to avoid crashes
import sentencepiece

# for text generation
from transformers import pipeline
generator = pipeline("text-generation", model="distilgpt2")  # remember to set length and number of returns when calling

# for NER; usage: ner('text')
ner = pipeline("ner", grouped_entities=True)

# for summarization; usage: summarizer('text')
summarizer = pipeline("summarization")

# for POS tagging (TextBlob needs these NLTK corpora)
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('brown')
from textblob import TextBlob  # blob = TextBlob(text); POS_List = blob.tags

# for zh -> en translation
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en", use_fast=False)
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en")

# traditional -> simplified Chinese; usage: jio.tra2sim(tra_text, mode='char')
import jionlp as jio
import gradio as gr
def TextProcessor(txt):
    """Dispatch input text to one of four NLP tasks.

    - First char outside the ASCII letter range (> 122): treat as Chinese,
      convert traditional -> simplified, then translate zh -> en.
    - Single English word: return its POS tag.
    - English text containing an ellipsis ("..." or "…"): text generation.
    - Other English text: extract noun and verb phrases from POS tags.

    Returns a string (translation / POS tag / generated text) or a tuple
    of labeled noun/verb phrase lists.
    """
    txt = str(txt)
    # guard: empty input would crash ord(txt[0])
    if not txt:
        return ""
    # ASCII code greater than 122 ('z') is treated as zh
    if ord(txt[0]) > 122:
        # convert to zh_sim before translating
        sim_text = jio.tra2sim(txt, mode='char')
        zh2en_trans = pipeline("translation_zh_to_en", model=model, tokenizer=tokenizer)
        return zh2en_trans(sim_text)[0]['translation_text']
    # single word -> POS tag; otherwise treat as sentence(s)
    if len(txt.split()) < 2:
        blob = TextBlob(txt)
        return blob.tags[0][1]
    # BUG FIX: original `if "..." or "…" in str(txt)` was always True
    # (non-empty literal "..." is truthy), so the phrase-extraction
    # branch below was unreachable.
    if "..." in txt or "…" in txt:
        # strip the trailing ellipsis before generating; handles both the
        # 3-char "..." and the 1-char "…" (original always cut 3 chars)
        prompt = txt.rstrip().rstrip(".").rstrip("…")
        txt_generation = generator(prompt, max_length=50, num_return_sequences=1)
        return txt_generation[0]["generated_text"]
    # otherwise: noun and verb phrases (summarization/NER left disabled,
    # matching the original commented-out code)
    # txt_summarization = summarizer(txt)
    # result_02 = ner(txt)
    blob = TextBlob(txt)
    pos_list = blob.tags
    # tags starting with "N" are nouns, with "V" are verbs
    noun_phrases = [np for np in pos_list if "N" in np[1][0]]
    verb_phrases = [vp for vp in pos_list if "V" in vp[1][0]]
    return ("noun_phrases:", noun_phrases, "verb_phrases:", verb_phrases)
# build and launch the Gradio UI: one text box in, one text box out
final = gr.Interface(fn=TextProcessor, inputs="text", outputs="text")
final.launch()