"""Gradio demo: classify text into academic subjects with a quantized HiAGM model.

Downloads the quantized checkpoint and vocabularies from the Hugging Face Hub
(private repo -- requires an access token in the TOKEN environment variable),
preprocesses input text the same way the HiAGM training pipeline did, and
serves multi-label predictions through a Gradio Interface.
"""

import os
import re

import gradio as gr
import torch
from huggingface_hub import hf_hub_download
from torch.utils.data import DataLoader, Dataset

# --- Remote assets -------------------------------------------------------
# NOTE(review): `use_auth_token` is deprecated in recent huggingface_hub
# releases in favour of `token=`; kept here for compatibility with whatever
# version this Space pins -- confirm before upgrading.
_HF_TOKEN = os.environ['TOKEN']
model_file_path = hf_hub_download("meliascosta/wiki_hiagm", "hiagm_dyn_quant.p", use_auth_token=_HF_TOKEN)
vocab_file_path = hf_hub_download("meliascosta/wiki_hiagm", "vocab/word.dict", use_auth_token=_HF_TOKEN)
label_file_path = hf_hub_download("meliascosta/wiki_hiagm", "vocab/label.dict", use_auth_token=_HF_TOKEN)

description = "A demo of the [HiAGM model](https://github.com/Alibaba-NLP/HiAGM) trained on wikipedia academic subject data. The trained model was quantized to reduce size."

examples = [
    "MicroRNA (miRNA) are small, single-stranded, non-coding RNA molecules containing 21 to 23 nucleotides. Found in plants, animals and some viruses, miRNAs are involved in RNA silencing and post-transcriptional regulation of gene expression.[1][2] miRNAs base-pair to complementary sequences in mRNA molecules,[3] then gene silence said mRNA molecules by one or more of the following processes: (1) cleavage of mRNA strand into two pieces, (2) destabilization of mRNA by shortening its poly(A) tail, or (3) translation of mRNA into proteins. This last method of gene silencing is the least efficient of the three, and requires the aid of ribosomes.",
    "What is an affidavit? From family law and bankruptcy matters to civil and criminal cases, affidavits are common legal documents used in a variety of court proceedings. And when used, it\u2019s just like taking an oath in court. Let\u2019s dive into what an affidavit is, its purposes, and examples of instances when you might use one. Affidavit definition.",
    "Napoleon Bonaparte[a] (born Napoleone Buonaparte; 15 August 1769 \u2013 5 May 1821), later known by his regnal name Napoleon I,[b] was a French military commander and political leader who rose to prominence during the French Revolution and led successful campaigns during the Revolutionary Wars. He was the de facto leader of the French Republic as First Consul from 1799 to 1804, then Emperor of the French from 1804 until 1814 and again in 1815. Napoleon's political and cultural legacy endures to this day, as a highly celebrated and controversial leader. He initiated many liberal reforms that have persisted in society, and is considered one of the greatest military commanders in history. His wars and campaigns are studied by militaries all over the world. Between three and six million civilians and soldiers perished in what became known as the Napoleonic Wars.",
]

# --- Model ---------------------------------------------------------------
# The checkpoint is a full pickled module (not a state_dict).  torch.load
# executes pickled code on load -- acceptable only because the file comes
# from our own model repository.
model = torch.load(model_file_path, map_location=torch.device('cpu'))
model.eval()

MAX_INPUT_LENGTH = 256  # NOTE(review): unused -- inputs are never truncated; confirm intent
K_TOP = 3               # NOTE(review): unused
BATCH_SIZE = 512        # NOTE(review): unused
P_THRESHOLD = 0.5       # minimum sigmoid probability for a label to be reported


class CustomImageDataset(Dataset):
    """Minimal list-backed dataset so DataLoader can collate a sample into a batch."""

    def __init__(self, samples):
        self.samples = samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]


def _load_tsv_dict(path):
    """Read a tab-separated vocab file; return (token -> index, index -> token).

    Each line is expected to hold exactly two tab-separated fields; the row
    number is used as the token's index.
    """
    tok2idx, idx2tok = {}, {}
    with open(path, encoding='utf-8') as f:
        for i, line in enumerate(f):
            data = line.rstrip().split('\t')
            assert len(data) == 2
            tok2idx[data[0]] = i
            idx2tok[i] = data[0]
    return tok2idx, idx2tok


# Word and label vocabularies (token <-> row-index mappings).
v2i, i2v = _load_tsv_dict(vocab_file_path)
v2i_lab, i2v_lab = _load_tsv_dict(label_file_path)

### PREPROCESSING
# HiAGM's stopword list.  The odd entries such as '\\.' and '\\?' are
# deliberate: clean_str() rewrites punctuation into backslash-prefixed tokens
# ('\.', '\?') which this list then removes.  A frozenset gives O(1)
# membership tests (the original list scan was O(n) per token).
english_stopwords = frozenset([
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",
    "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
    'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll",
    'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
    'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
    'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against',
    'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from',
    'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
    'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
    'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than',
    'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've",
    'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't",
    'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",
    'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't",
    'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
    'won', "won't", 'wouldn', "wouldn't", '\\.', '\\?', ',', '\\!', "'s", '',
])


def clean_stopwords(sample):
    """Drop stopword tokens.

    :param sample: List[Str], lower case
    :return: List[Str]
    """
    return [token for token in sample if token not in english_stopwords]


def clean_str(string):
    """Normalize raw text: strip non-letters, space out punctuation, lowercase.

    Original Source: https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    :param string: Str
    :return -> Str
    """
    string = string.strip().strip('"')
    string = re.sub(r"[^A-Za-z(),!?\.\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    # Raw strings below keep the replacement bytes identical to the original
    # while avoiding invalid-escape SyntaxWarnings (Python 3.12+).  They
    # intentionally emit backslash-prefixed tokens ('\.', '\(', '\)', '\?');
    # '\.' and '\?' are later removed by the stopword list.
    string = re.sub(r"\.", r" \. ", string)
    string = re.sub(r"\"", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", r" \( ", string)
    string = re.sub(r"\)", r" \) ", string)
    string = re.sub(r"\?", r" \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def preprocess_line(sample):
    """
    :param sample: Str, "The sample would be tokenized and filtered according to the stopwords list"
    :return: token_list -> List[Str]
    """
    sample = clean_str(sample.lstrip().rstrip())
    token_list = clean_stopwords(sample.split(' '))
    return {'token': token_list, 'label': []}


def predict(line):
    """Classify one text; return {label: probability} for labels above P_THRESHOLD.

    :param line: Str, raw user input
    :return: Dict[Str, float]
    """
    preprocessed_line = preprocess_line(line)
    # Map tokens to vocab indices; unknown tokens fall back to the '' entry.
    token_ids = [v2i.get(v.lower(), v2i['']) for v in preprocessed_line['token']]
    sample = {
        "token": torch.tensor(token_ids),
        "token_len": torch.tensor(len(token_ids)),
    }
    # DataLoader's default collate_fn stacks the single sample into the
    # batch-of-one dict of tensors the model expects.
    batch = next(iter(DataLoader(CustomImageDataset([sample]))))
    with torch.no_grad():  # inference only -- skip autograd bookkeeping
        output_logits = model(batch)
    output_probs = torch.sigmoid(output_logits).cpu().tolist()
    return {i2v_lab[i]: p for i, p in enumerate(output_probs[0]) if p > P_THRESHOLD}


iface = gr.Interface(
    fn=predict,
    inputs="text",
    outputs=gr.Label(num_top_classes=5),
    title="What Academic subject?",
    description=description,
    examples=examples,
)

if __name__ == "__main__":
    iface.launch()