Update README.md
README.md CHANGED

@@ -87,7 +87,7 @@ def is_subset(text1, text2):
 def cleaning(text, tags):
     return [tag for tag in tags if is_subset(text, tag)]
 
-def get_texts(self, text, max_len):
+def get_texts(text, max_len):
     texts = list(filter(lambda x : x != '', text.split('\n\n')))
     lengths = [len(tokenizer.encode(paragraph)) for paragraph in texts]
     output = []
@@ -99,7 +99,7 @@ def get_texts(self, text, max_len):
         output.append(par)
     return output
 
-def get_tags(self, text, generate_kwargs):
+def get_tags(text, generate_kwargs):
     input_text = 'summarize: ' + text.strip().replace('\n', ' ')
     tokenized_text = tokenizer.encode(input_text, return_tensors="pt")
     with torch.no_grad():
@@ -115,7 +115,7 @@ def get_tags(self, text, generate_kwargs):
 
     return list(set(itertools.chain(*output)))
 
-def tag(self, text, max_len, generate_kwargs):
+def tag(text, max_len, generate_kwargs):
     texts = get_texts(text, max_len)
     all_tags = [get_tags(text, generate_kwargs) for text in texts]
     flatten_tags = itertools.chain(*all_tags)
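
For reference, after this change the README snippets read as standalone functions rather than class methods. A minimal usage sketch (not part of the commit) could look like the one below; it assumes the README's tag, get_texts, and get_tags helpers are in scope, that they read module-level tokenizer and model objects, and that a T5-style summarization checkpoint from Hugging Face transformers is used. The checkpoint name, input text, and generation settings are illustrative assumptions, not taken from the repository.

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Assumed checkpoint; the diff does not name the model actually used.
tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
model.eval()

# Placeholder article text and illustrative generation settings.
article_text = "First paragraph of the article.\n\nSecond paragraph of the article."
generate_kwargs = {"num_beams": 4, "max_length": 64}

# tag() splits the text on blank lines, generates candidate tags per chunk,
# and merges the results, as in the functions shown in the diff above.
tags = tag(article_text, max_len=512, generate_kwargs=generate_kwargs)
print(tags)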