added main and stm
Browse files
- config.json +34 -0
- main.py +60 -0
- model.safetensors +3 -0
- special_tokens_map.json +7 -0
- stm.py +49 -0
- tokenizer.json +0 -0
- tokenizer_config.json +55 -0
- vocab.txt +0 -0
config.json
ADDED
@@ -0,0 +1,34 @@
+{
+  "_name_or_path": "distilbert-base-uncased",
+  "activation": "gelu",
+  "architectures": [
+    "DistilBertForTokenClassification"
+  ],
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "id2label": {
+    "0": "O",
+    "1": "B-ABS",
+    "2": "I-ABS"
+  },
+  "initializer_range": 0.02,
+  "label2id": {
+    "B-ABS": 1,
+    "I-ABS": 2,
+    "O": 0
+  },
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "pad_token_id": 0,
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "tie_weights_": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.38.2",
+  "vocab_size": 30522
+}
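Note: config.json describes a DistilBERT token-classification head with three labels (O, B-ABS, I-ABS). A minimal sketch of running the checkpoint locally with the transformers pipeline, assuming the files in this commit are downloaded into a local folder (./Entity_Rec is a hypothetical path):

# Sketch only; "./Entity_Rec" is a hypothetical local copy of the files in this commit.
from transformers import pipeline

abs_ner = pipeline(
    "token-classification",
    model="./Entity_Rec",
    aggregation_strategy="simple",  # merge B-ABS / I-ABS word pieces into whole spans
)
print(abs_ner("today stock prices and home loans are a pain in san francisco."))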
main.py
ADDED
@@ -0,0 +1,60 @@
+import requests
+import spacy
+from spacy import displacy
+from dotenv import load_dotenv
+import os
+from stm import ShortTermMemory
+
+load_dotenv()
+api_key = os.getenv("API_KEY")
+
+API_URL = "https://api-inference.huggingface.co/models/cleopatro/Entity_Rec"
+headers = {"Authorization": f"Bearer {api_key}"}
+NER = spacy.load("en_core_web_sm")
+
+
+def extract_word_and_entity_group(entities):
+    # Collect the 'word' field of each entity returned by the inference API.
+    words = []
+
+    for item in entities:
+        word = item['word']
+        words.append(word)
+
+    return words
+
+
+def get_abs(payload):
+    # Query the hosted abstract-entity (ABS) token-classification model.
+    response = requests.post(API_URL, headers=headers, json=payload)
+    return response.json()
+
+
+def get_loc_time(sentence):
+    # Use spaCy's pretrained NER to collect locations and time expressions.
+    text1 = NER(sentence)
+    locations = []
+    times = []
+    for ent in text1.ents:
+        if ent.label_ == "GPE" or ent.label_ == "LOC":
+            locations.append(ent.text)
+        elif ent.label_ == "TIME" or ent.label_ == "DATE":
+            times.append(ent.text)
+    return locations, times
+
+
+def get_ent(sentence):
+    # Combine the hosted ABS model with spaCy location/time extraction.
+    abs_dict = get_abs(sentence)
+    abs_tags = extract_word_and_entity_group(abs_dict)
+    loc_tags, time_tags = get_loc_time(sentence["inputs"])
+    return abs_tags, loc_tags, time_tags
+
+
+# output = get_ent({
+#     "inputs": "today stock prices and home loans are a pain in san francisco.",
+# })
+
+# print(output)
+
+# stm = ShortTermMemory(window_size=5, decay_rate=0.8)
+
+# stm.update('abstract', 'credit-card')
+# print(stm.get_memory())  # Output: {'abstract_entities': {'credit-card': 1}, 'locations': {}, 'times': {}}
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:16d964bd2589ac2a8af7b5b2afe2e38a8c09346ace42dbe036e8315d9dcd59e1
+size 265473092
special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}
stm.py
ADDED
@@ -0,0 +1,49 @@
+from collections import defaultdict
+
+class ShortTermMemory:
+    def __init__(self, window_size=10, decay_rate=0.5):
+        self.abstract_entities = defaultdict(int)
+        self.locations = defaultdict(int)
+        self.times = defaultdict(int)
+        self.window_size = window_size
+        self.decay_rate = decay_rate
+
+    def update(self, entity_type, entity):
+        # Determine the appropriate dictionary based on the entity type
+        if entity_type == 'abstract':
+            entity_dict = self.abstract_entities
+        elif entity_type == 'location':
+            entity_dict = self.locations
+        elif entity_type == 'time':
+            entity_dict = self.times
+        else:
+            raise ValueError(f'Invalid entity type: {entity_type}')
+
+        # Increment the count for the given entity (.get so this still works after the dict is rebuilt below)
+        entity_dict[entity] = entity_dict.get(entity, 0) + 1
+
+        # Decay the counts of other entities in the same dictionary
+        for e, count in list(entity_dict.items()):
+            if e != entity:
+                entity_dict[e] = int(count * self.decay_rate)
+
+        # Remove entities whose count has decayed to zero
+        entity_dict = {e: count for e, count in entity_dict.items() if count > 0}
+
+        # Trim the dictionary to the window size
+        entity_dict = dict(sorted(entity_dict.items(), key=lambda x: x[1], reverse=True)[:self.window_size])
+
+        # Update the appropriate dictionary with the trimmed version
+        if entity_type == 'abstract':
+            self.abstract_entities = entity_dict
+        elif entity_type == 'location':
+            self.locations = entity_dict
+        elif entity_type == 'time':
+            self.times = entity_dict
+
+    def get_memory(self):
+        return {
+            'abstract_entities': self.abstract_entities,
+            'locations': self.locations,
+            'times': self.times
+        }
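Note: a short walk-through of ShortTermMemory's decay and pruning, with counts worked out by hand from update() above (the entities are hypothetical):

# Hypothetical usage; each update increments one entity, halves the others via int(),
# prunes zero counts, and keeps at most window_size entries.
stm = ShortTermMemory(window_size=10, decay_rate=0.5)
stm.update('abstract', 'stocks')   # {'stocks': 1}
stm.update('abstract', 'loans')    # 'stocks' decays to int(1 * 0.5) = 0 and is pruned -> {'loans': 1}
stm.update('abstract', 'loans')    # {'loans': 2}
stm.update('abstract', 'stocks')   # 'loans' decays to int(2 * 0.5) = 1 -> {'loans': 1, 'stocks': 1}
print(stm.get_memory()['abstract_entities'])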
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
tokenizer_config.json
ADDED
@@ -0,0 +1,55 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "DistilBertTokenizer",
+  "unk_token": "[UNK]"
+}
vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff