PeteBleackley committed
Commit e149b0f
1 Parent(s): c106121

Coreference Resolution for WikiQA dataset

.gitignore CHANGED
@@ -1,3 +1,5 @@
 *.json
 */__pycache__/*
 *.pyc
+*.tsv
+*.csv
DataSets.md CHANGED
@@ -8,6 +8,8 @@ We are planning to use the following datasets to train the models.
 
 ## Question Answering
 
+[WikiQA (Wikipedia Open-Domain Question Answering)](https://paperswithcode.com/dataset/wikiqa)
+
 ## Reasoning
 
 [Avicenna: Syllogistic Commonsense Reasoning](https://github.com/ZeinabAghahadi/Syllogistic-Commonsense-Reasoning)
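For reference, WikiQA is distributed as tab-separated files in which each row pairs a question with one candidate answer sentence from a Wikipedia article; the preparation script added in this commit relies on the QuestionID, Question, Sentence and Label columns. A minimal sketch of inspecting the data (the file name is an assumption; use whichever split was downloaded):

import pandas

# Hypothetical local copy of a WikiQA split; adjust the path as needed.
wikiqa = pandas.read_csv('WikiQA-train.tsv', sep='\t')

# Columns used by prepare_wiki_qa below: QuestionID, Question, Sentence, Label.
print(wikiqa[['QuestionID', 'Question', 'Sentence', 'Label']].head())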
qarac/utils/CoreferenceResolver.py ADDED
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Sep 11 09:46:51 2023
+
+@author: peter
+"""
+
+from allennlp.predictors.predictor import Predictor
+import pandas
+
+
+def clean(sentence):
+    # Ensure every sentence ends with a full stop before tokenization.
+    return sentence if sentence.strip().endswith('.') else sentence+'.'
+
+
+class CoreferenceResolver(object):
+    """Resolves coreferences within a group of sentences, replacing each
+    mention with the longest (canonical) mention of its cluster."""
+
+    def __init__(self):
+        model_url = "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz"
+        self.predictor = Predictor.from_path(model_url)
+
+    def __call__(self,group):
+        # Tokenize each sentence and record where each one ends in the
+        # concatenated document.
+        tokenized = group.apply(clean).str.split()
+        line_breaks = tokenized.apply(len).cumsum()
+        doc = []
+        for line in tokenized:
+            doc.extend(line)
+        clusters = self.predictor.predict_tokenized(doc)
+        # For each coreference cluster, pick the longest mention as the
+        # canonical form and record which positions it should replace.
+        resolutions = {}
+        for cluster in clusters['clusters']:
+            starts = []
+            longest = -1
+            canonical = None
+            for [start_pos,end_pos] in cluster:
+                resolutions[start_pos]={'end':end_pos+1}
+                starts.append(start_pos)
+                length = end_pos - start_pos
+                if length > longest:
+                    longest = length
+                    canonical = doc[start_pos:end_pos+1]
+            for start in starts:
+                resolutions[start]['canonical']=canonical
+        # Rebuild the sentences, substituting each mention with the
+        # canonical form of its cluster.
+        doc_pos = 0
+        line = 0
+        results = []
+        current = []
+        while doc_pos < len(doc):
+            if doc_pos in resolutions:
+                current.extend(resolutions[doc_pos]['canonical'])
+                doc_pos=resolutions[doc_pos]['end']
+            else:
+                current.append(doc[doc_pos])
+                doc_pos+=1
+            if doc_pos>=line_breaks.iloc[line]:
+                results.append(' '.join(current))
+                line+=1
+                current = []
+        return pandas.Series(results,
+                             index=group.index)
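The resolver is intended to be called on one group of sentences at a time (in this commit, all candidate answer sentences for a single question). A minimal usage sketch, assuming the AllenNLP SpanBERT model downloads successfully; the example sentences are invented:

import pandas
from qarac.utils.CoreferenceResolver import CoreferenceResolver

resolver = CoreferenceResolver()  # downloads the SpanBERT coref model on first use

# Invented sentences standing in for one question's candidate answers.
sentences = pandas.Series(['Marie Curie was a physicist and chemist.',
                           'She won two Nobel Prizes.'])

# Each mention is replaced by the longest mention in its cluster,
# e.g. 'She' should become 'Marie Curie' in the second sentence.
print(resolver(sentences))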
qarac/utils/__init__.py ADDED
File without changes
requirements.txt CHANGED
@@ -8,3 +8,5 @@ transformers
 spacy
 spacy-experimental
 pandas
+allennlp
+allennlp-models
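The two new entries pull in the AllenNLP runtime and its pretrained-model package; installation from the updated file is unchanged:

pip install -r requirements.txt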
scripts.py CHANGED
@@ -1,5 +1,6 @@
 
 import os
+import re
 import argparse
 import pickle
 import tokenizers
@@ -9,8 +10,10 @@ import qarac.models.qarac_base_model
 import keras
 import tensorflow
 import spacy
-import spacy_experimental
 import pandas
+import qarac.utils.CoreferenceResolver
+
+
 
 def decoder_loss(y_true,y_pred):
     return keras.losses.sparse_categorical_crossentropy(y_true,
@@ -28,13 +31,17 @@ def clean_question(doc):
 
 def prepare_wiki_qa(filename,outfilename):
     data = pandas.read_csv(filename,sep='\t')
+    data['QNum']=data['QuestionID'].apply(lambda x: int(x[1:]))
     nlp = spacy.load('en_core_web_trf')
-    nlp.add_pipe('experimental_coref')
-    data['Resolved_answer'] = pandas.Series([sent.text
-                                             for doc in nlp.pipe(data.groupby('DocumentID')['Sentence'].apply(lambda x: ' '.join(x)))
-                                             for sent in doc.sentences])
-    data['Cleaned_questions']=pandas.Series([clean_question(doc) for doc in nlp.pipe(data)])
-    data[['Cleaned_questions','Resolved_answers','Label']].to_csv(outfilename)
+    predictor = qarac.utils.CoreferenceResolver.CoreferenceResolver()
+    data['Resolved_answer'] = data.groupby('QNum')['Sentence'].transform(predictor)
+    unique_questions = data.groupby('QNum')['Question'].first()
+    cleaned_questions = pandas.Series([clean_question(doc)
+                                       for doc in nlp.pipe(unique_questions)],
+                                      index = unique_questions.index)
+    for (i,question) in cleaned_questions.items():
+        data.loc[data['QNum']==i,'Cleaned_question']=question
+    data[['Cleaned_question','Resolved_answer','Label']].to_csv(outfilename)
 
 
 def train_base_model(task,filename):
@@ -71,7 +78,10 @@ if __name__ == '__main__':
     parser.add_argument('task')
     parser.add_argument('-f','--filename')
     parser.add_argument('-t','--training-task')
+    parser.add_argument('-o','--outputfile')
     args = parser.parse_args()
    if args.task == 'train_base_model':
         train_base_model(args.training_task,args.filename)
+    elif args.task == 'prepare_wiki_qa':
+        prepare_wiki_qa(args.filename,args.outputfile)
 
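With the new branch in the argument parser, the WikiQA preparation step can be run directly from the command line. A minimal invocation sketch; the input and output file names are assumptions:

python scripts.py prepare_wiki_qa -f WikiQA-train.tsv -o WikiQA-train-resolved.csv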