nielklug committed on
Commit
7884ed6
1 Parent(s): 920b22f
Files changed (2) hide show
  1. app.py +1 -1
  2. parse.py +27 -0
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import streamlit as st
2
- from parse import parse_text
3
  from nltk import Tree
4
  import pandas as pd
5
  import re
 
1
  import streamlit as st
2
+ # from parse import parse_text
3
  from nltk import Tree
4
  import pandas as pd
5
  import re
parse.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import sys
3
+ import benepar
4
+ from huggingface_hub import hf_hub_download
5
+
6
# Checkpoint identifier shared by the Hub download call and the benepar loader.
# NOTE(review): this value is passed as `repo_id`, yet it looks like a path to
# a .pt file and differs from the `filename` requested below -- confirm which
# repository/file pair is intended.
model_path = "ParserModels/ENHG/new-convbert-german-europeana0_dev=83.03.pt"

# NOTE(review): the value returned by hf_hub_download is discarded and the
# parser is loaded from `model_path` instead -- verify that this is the file
# that should be loaded.
hf_hub_download(
    repo_id=model_path,
    filename="german-delex-parser_dev=83.10.pt",
)

# Module-level parser instance used by parse().
parser = benepar.Parser(model_path)
9
+
10
def parse(words):
    """Parse a tokenized sentence and return its tree as a one-line string.

    Args:
        words: Iterable of token strings. Round brackets inside tokens are
            rewritten to ``-LRB-`` / ``-RRB-`` before parsing.

    Returns:
        The string form of the tree produced by the module-level ``parser``,
        with bracket placeholders turned into backslash-escaped brackets, all
        whitespace runs collapsed to single spaces, and the space in front of
        every opening parenthesis removed.
    """
    escaped_tokens = []
    for token in words:
        escaped_tokens.append(token.replace('(', '-LRB-').replace(')', '-RRB-'))

    sentence = benepar.InputSentence(words=escaped_tokens)
    rendered = str(parser.parse(sentence))

    # Applied strictly in this order; the last rule patches the '($(-' sequence
    # that is still unescaped after the placeholder substitutions.
    substitutions = (
        ('-LRB-', '\\('),
        ('-RRB-', '\\)'),
        ('-LSB-', '\\['),
        ('-RSB-', '\\]'),
        ('($(-', '($\\(-'),
    )
    for placeholder, escaped in substitutions:
        rendered = rendered.replace(placeholder, escaped)

    # Put the whole parse tree on a single line, then glue each opening
    # parenthesis to the text that precedes it.
    single_line = re.sub(r'\s+', ' ', rendered.strip())
    return re.sub(r' \(', '(', single_line)
19
+
20
+
21
def tokenize_line(line):
    """Split one line of raw text into tokens, detaching punctuation.

    Args:
        line: A single line of text; surrounding whitespace is stripped.

    Returns:
        A list of token strings. Closing punctuation (``. , ; : ? ! ) "``) is
        separated from the non-space character before it, and an opening
        ``"`` or ``(`` is separated from the non-space character after it.
        Blank input yields an empty list.
    """
    # Insert a space before punctuation that directly follows a token.
    spaced = re.sub(r'(\S)([.,;:?!)"])', r'\1 \2', line.strip())
    # Insert a space after an opening quote/bracket glued to a token.
    spaced = re.sub(r'(["(])(\S)', r'\1 \2', spaced)
    return spaced.split()


def main(argv=None):
    """Parse every line of the input file and print one tree per line.

    Args:
        argv: Command-line arguments without the program name. Defaults to
            ``sys.argv[1:]``. The first entry is the path of the text file
            to read.

    Raises:
        SystemExit: If no input file path was supplied.
    """
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.exit("usage: parse.py INPUT_FILE")

    with open(args[0]) as file:
        for line in file:
            print(parse(tokenize_line(line)))


# Run the command-line loop only when executed as a script, so that importing
# this module does not try to open sys.argv[1].
if __name__ == "__main__":
    main()