titipata committed
Commit 9cb9653
1 Parent(s): 1673527

Using spacy for sentence tokenization

Files changed (1)
  1. app.py +6 -1
app.py CHANGED
@@ -1,7 +1,10 @@
 from transformers import AutoTokenizer
 from transformers import AutoModelForSequenceClassification
+import spacy
 import gradio as gr
 
+spacy.cli.download("en_core_web_lg")
+nlp = spacy.load("en_core_web_lg")
 model_name = "biodatlab/score-claim-identification"
 tokenizer_name = "allenai/scibert_scivocab_uncased"
 
@@ -13,8 +16,10 @@ def inference(abstract: str):
     """
     Split an abstract into sentences and perform claim identification.
     """
+    if abstract.strip() == "":
+        return "Please provide an abstract as an input."
     claims = []
-    sents = abstract.split('. ')
+    sents = [sent.text for sent in nlp(abstract).sents]  # abstract to sentences
     inputs = tokenizer(
         sents,
         return_tensors="pt",
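
Why the change matters: the old abstract.split('. ') breaks on any period followed by a space, so abbreviations such as "i.e." or "vs." split sentences in the wrong places, while spaCy's pipeline uses model-based sentence boundaries. Below is a minimal sketch of the difference, separate from the commit itself; the example abstract string is made up for illustration.

import spacy

# Download and load the same model the commit uses; a smaller model such
# as en_core_web_sm would also work for plain sentence segmentation.
spacy.cli.download("en_core_web_lg")
nlp = spacy.load("en_core_web_lg")

abstract = (
    "We frame claim identification as binary classification (i.e. claim "
    "vs. non-claim) over sentences. Results improve over the baseline."
)

naive = abstract.split('. ')                         # also splits inside "i.e. " and "vs. "
sents = [sent.text for sent in nlp(abstract).sents]  # model-based sentence boundaries

print(len(naive), naive)  # over-segmented fragments
print(len(sents), sents)  # expected: the two real sentences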