File size: 1,417 Bytes
e71e3dd 858e6e8 e71e3dd 831abcc 858e6e8 e71e3dd 24c6141 e71e3dd 24c6141 7a0cf24 24c6141 e71e3dd f3f21ea 7a0cf24 f3f21ea e71e3dd 5049d16 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
from typing import List, Tuple
import torch
import nltk
from SciAssist import DatasetExtraction
# Backend label: "gpu" when torch reports a usable CUDA device, else "cpu".
# NOTE(review): `device` is not passed to the pipeline in the code visible here.
if torch.cuda.is_available():
    device = "gpu"
else:
    device = "cpu"

# Single shared dataset-extraction pipeline, constructed once at import time
# and reused by every handler in this module.
de_pipeline = DatasetExtraction(os_name="nt")
def de_for_str(input):
    """Extract dataset mentions from a piece of free text.

    The text is split into sentences with ``nltk.sent_tokenize`` and the
    resulting list is handed to the module-level ``de_pipeline``.

    Args:
        input: The raw text to analyse. The name shadows the builtin but is
            kept unchanged so existing callers keep working.

    Returns:
        A list of 2-tuples. For every entry in ``results["dataset_mentions"]``
        the list holds ``(entry[0], entry[1])`` followed by a separator tuple
        whose first item is two newlines and whose second item is ``None``.
        An empty list is returned when ``input`` is ``None``, empty or
        whitespace-only.
    """
    # Nothing to tokenize: skip the tokenizer and the model call entirely.
    if not input or not input.strip():
        return []

    sentences = nltk.sent_tokenize(input)
    results = de_pipeline.extract(sentences, type="str", save_results=False)

    output = []
    for mention_pair in results["dataset_mentions"]:
        # Each mention is followed by a blank-line separator with no label.
        output.extend(((mention_pair[0], mention_pair[1]), ("\n\n", None)))
    return output
def de_for_file(input):
    """Extract dataset mentions from an uploaded ``.txt`` or ``.pdf`` file.

    Args:
        input: An object exposing a ``name`` attribute that holds the path of
            the file to process, or ``None`` when no file was supplied. The
            name shadows the builtin but is kept unchanged for callers.

    Returns:
        ``None`` when ``input`` is ``None``.
        ``[("File Format Error !", None)]`` when the file name ends in neither
        ``.txt`` nor ``.pdf`` (compared case-insensitively).
        Otherwise a list of 2-tuples: for every entry in
        ``results["dataset_mentions"]`` it holds ``(entry[0], entry[1])``
        followed by a separator tuple of two newlines and ``None``.
    """
    if input is None:
        return None

    filename = input.name

    # Pick the pipeline input type from the extension; lower-casing first
    # means ".PDF" / ".TXT" are accepted as well.
    lowered = filename.lower()
    if lowered.endswith(".txt"):
        file_type = "txt"
    elif lowered.endswith(".pdf"):
        file_type = "pdf"
    else:
        return [("File Format Error !", None)]

    results = de_pipeline.extract(filename, type=file_type, save_results=False)

    output = []
    for mention_pair in results["dataset_mentions"]:
        # Each mention is followed by a blank-line separator with no label.
        output.extend(((mention_pair[0], mention_pair[1]), ("\n\n", None)))
    return output
# Sample sentence kept as a ready-made example input for the string handler.
de_str_example = "BAKIS incorporates information derived from the bank balance sheets and supervisory reports of all German banks ."