parth parekh committed
Commit 7e63028
working demo
- .gitattributes +35 -0
- .gitignore +1 -0
- Dockerfile +28 -0
- README.md +10 -0
- __pycache__/app.cpython-312.pyc +0 -0
- __pycache__/predictor.cpython-312.pyc +0 -0
- __pycache__/test.cpython-312.pyc +0 -0
- app.py +100 -0
- contact_sharing_epoch_1.pth +3 -0
- load_test.py +67 -0
- predictor.py +128 -0
- requirements.txt +70 -0
- test.py +166 -0
- vocab.pth +3 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1 @@
+.venv
Dockerfile
ADDED
@@ -0,0 +1,28 @@
+FROM python:3.12-slim
+
+# Create a new user
+RUN useradd -m user
+
+WORKDIR /app
+
+RUN apt-get update && apt-get install -y \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    libgl1-mesa-glx \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . .
+
+# Change ownership of the /app directory to the new user
+RUN chown -R user:user /app
+
+# Switch to the new user
+USER user
+
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "4"]
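
The Dockerfile serves the FastAPI app with uvicorn on port 7860 as a non-root user. For quick local debugging without building the image, a minimal sketch (assuming the packages from requirements.txt, including the pinned en_core_web_sm wheel, are installed in the active environment):

import uvicorn

if __name__ == "__main__":
    # Mirrors the container CMD above; a single worker keeps debugging simple.
    uvicorn.run("app:app", host="0.0.0.0", port=7860)
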
README.md
ADDED
@@ -0,0 +1,10 @@
+---
+title: Contact Sharing Recognizer API
+emoji: 🤙
+colorFrom: indigo
+colorTo: pink
+sdk: docker
+pinned: false
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/app.cpython-312.pyc
ADDED
Binary file (4.43 kB)
__pycache__/predictor.cpython-312.pyc
ADDED
Binary file (9.06 kB)
__pycache__/test.cpython-312.pyc
ADDED
Binary file (11.6 kB)
app.py
ADDED
@@ -0,0 +1,100 @@
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+import torch
+from torch.nn.functional import softmax
+import re
+from predictor import predict, batch_predict  # Assuming batch_predict is in predictor module
+
+app = FastAPI(
+    title="Contact Information Detection API",
+    description="API for detecting contact information in text, great thanks to xxparthparekhxx/ContactShieldAI for the model",
+    version="1.0.0",
+    docs_url="/"
+)
+
+def preprocess_text(text):
+    # Remove all punctuation except for @ and . which are often used in email addresses
+    return re.sub(r'[^\w\s@.]', '', text)
+
+class TextInput(BaseModel):
+    text: str
+
+class BatchTextInput(BaseModel):
+    texts: list[str]
+
+def check_regex_patterns(text):
+    patterns = [
+        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',  # Email
+        r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b',  # Phone number
+        r'\b\d{5}(?:[-\s]\d{4})?\b',  # ZIP code
+        r'\b\d+\s+[\w\s]+(?:street|st|avenue|ave|road|rd|highway|hwy|square|sq|trail|trl|drive|dr|court|ct|park|parkway|pkwy|circle|cir|boulevard|blvd)\b\s*(?:[a-z]+\s*\d{1,3})?(?:,\s*(?:apt|bldg|dept|fl|hngr|lot|pier|rm|ste|unit|#)\s*[a-z0-9-]+)?(?:,\s*[a-z]+\s*[a-z]{2}\s*\d{5}(?:-\d{4})?)?',  # Street address
+        r'(?:http|https)://(?:www\.)?[a-zA-Z0-9-]+\.[a-zA-Z]{2,}(?:/[^\s]*)?'  # Website URL
+    ]
+
+    for pattern in patterns:
+        if re.search(pattern, text, re.IGNORECASE):
+            return True
+    return False
+
+@app.post("/detect_contact", summary="Detect contact information in text")
+async def detect_contact(input: TextInput):
+    try:
+        preprocessed_text = preprocess_text(input.text)
+
+        # First, check with regex patterns
+        if check_regex_patterns(preprocessed_text):
+            return {
+                "text": input.text,
+                "is_contact_info": True,
+                "method": "regex"
+            }
+
+        # If no regex patterns match, use the model
+        is_contact = predict(preprocessed_text)
+        return {
+            "text": input.text,
+            "is_contact_info": is_contact == 1,
+            "method": "model"
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.post("/batch_detect_contact", summary="Detect contact information in batch of texts")
+async def batch_detect_contact(inputs: BatchTextInput):
+    try:
+        # Preprocess all texts
+        preprocessed_texts = [preprocess_text(text) for text in inputs.texts]
+
+        # First, use regex to check patterns
+        regex_results = [check_regex_patterns(text) for text in preprocessed_texts]
+
+        # For texts where regex doesn't detect anything, use the model
+        texts_for_model = [text for text, regex_match in zip(preprocessed_texts, regex_results) if not regex_match]
+        if texts_for_model:
+            model_results = batch_predict(texts_for_model)
+        else:
+            model_results = []
+
+        # Prepare final results
+        results = []
+        model_idx = 0
+        for i, text in enumerate(preprocessed_texts):
+            if regex_results[i]:
+                results.append({
+                    "text": inputs.texts[i],
+                    "is_contact_info": True,
+                    "method": "regex"
+                })
+            else:
+                is_contact = model_results[model_idx]
+                results.append({
+                    "text": inputs.texts[i],
+                    "is_contact_info": bool(is_contact),  # Convert numpy bool
+                    "method": "model"
+                })
+                model_idx += 1
+
+        return results
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
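
app.py exposes two POST endpoints: /detect_contact accepts {"text": ...} and /batch_detect_contact accepts {"texts": [...]}; both run the regex patterns first and fall back to the model, returning the original text, an is_contact_info flag, and the method that decided it. A minimal client sketch using httpx (already pinned in requirements.txt), assuming the service is reachable on localhost:7860 as in the Dockerfile CMD:

import httpx

BASE_URL = "http://localhost:7860"  # assumption: local container or uvicorn; use the deployed Space URL otherwise

with httpx.Client(base_url=BASE_URL, timeout=30.0) as client:
    # Single-text endpoint
    single = client.post("/detect_contact", json={"text": "Email me at jane.doe@example.com"})
    single.raise_for_status()
    print(single.json())  # e.g. {"text": "...", "is_contact_info": true, "method": "regex"}

    # Batch endpoint
    batch = client.post(
        "/batch_detect_contact",
        json={"texts": ["Call me at 555-123-4567", "The weather is lovely today"]},
    )
    batch.raise_for_status()
    for item in batch.json():
        print(item["method"], item["is_contact_info"], item["text"])
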
contact_sharing_epoch_1.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bdb70e711c212856ce3df95b82afbae57b8fc34243b3f541ecd65963fa81fd92
+size 813497259
load_test.py
ADDED
@@ -0,0 +1,67 @@
+import asyncio
+import aiohttp
+import json
+from tqdm.asyncio import tqdm
+import time
+from test import test_texts
+
+url = "https://vidhitmakvana1-contact-sharing-recognizer-api.hf.space/detect_contact"
+concurrent_requests = 2
+
+async def process_text(session, text, semaphore):
+    payload = {"text": text}
+    headers = {"Content-Type": "application/json"}
+
+    async with semaphore:
+        start_time = time.time()
+        while True:
+            async with session.post(url, data=json.dumps(payload), headers=headers) as response:
+                if response.status == 200:
+                    result = await response.json()
+                    end_time = time.time()
+                    result['response_time'] = end_time - start_time
+                    return result
+                elif response.status == 429:
+                    print(f"Rate limit exceeded. Waiting for 60 seconds before retrying...")
+                    await asyncio.sleep(60)
+                else:
+                    print(f"Error for text: {text}")
+                    print(f"Status code: {response.status}")
+                    print(f"Response: {await response.text()}")
+                    return None
+
+async def main():
+    semaphore = asyncio.Semaphore(concurrent_requests)
+    async with aiohttp.ClientSession() as session:
+        tasks = [process_text(session, text, semaphore) for text in [*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts,*test_texts]]
+        results = await tqdm.gather(*tasks)
+
+    correct_predictions = 0
+    total_predictions = len(results)
+    total_response_time = 0
+
+    for text, result in zip(test_texts, results):
+        if result:
+            print(f"Text: {result['text']}")
+            print(f"Contact Probability: {result['contact_probability']:.4f}")
+            print(f"Is Contact Info: {result['is_contact_info']}")
+            print(f"Response Time: {result['response_time']:.4f} seconds")
+            print("---")
+
+            if result['is_contact_info']:
+                correct_predictions += 1
+
+            total_response_time += result['response_time']
+
+    accuracy = correct_predictions / (total_predictions * 37)
+    average_response_time = total_response_time / total_predictions
+    print(f"Accuracy: {accuracy:.2f}")
+    print(f"Average Response Time: {average_response_time:.4f} seconds")
+
+if __name__ == "__main__":
+    while True:
+        start_time = time.time()
+        asyncio.run(main())
+        end_time = time.time()
+        total_time = end_time - start_time
+        print(f"\nTotal execution time: {total_time:.2f} seconds")
predictor.py
ADDED
@@ -0,0 +1,128 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torchtext.vocab import build_vocab_from_iterator, GloVe
+from torchtext.data.utils import get_tokenizer
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+class ContactSharingClassifier(nn.Module):
+    def __init__(self, vocab_size, embed_dim, num_filters, filter_sizes, lstm_hidden_dim, output_dim, dropout, pad_idx):
+        super().__init__()
+        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
+        self.lstm = nn.LSTM(embed_dim, lstm_hidden_dim, bidirectional=True, batch_first=True)
+        self.convs = nn.ModuleList([
+            nn.Conv1d(in_channels=lstm_hidden_dim*2, out_channels=num_filters, kernel_size=fs)
+            for fs in filter_sizes
+        ])
+        self.fc1 = nn.Linear(len(filter_sizes) * num_filters, len(filter_sizes) * num_filters // 2)
+        self.fc2 = nn.Linear(len(filter_sizes) * num_filters // 2, output_dim)
+        self.dropout = nn.Dropout(dropout)
+        self.layer_norm = nn.LayerNorm(len(filter_sizes) * num_filters)
+
+    def forward(self, text):
+        embedded = self.embedding(text)
+        lstm_out, _ = self.lstm(embedded)
+        lstm_out = lstm_out.permute(0, 2, 1)
+        conved = [F.relu(conv(lstm_out)) for conv in self.convs]
+        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
+        cat = self.dropout(torch.cat(pooled, dim=1))
+        cat = self.layer_norm(cat)
+        x = F.relu(self.fc1(cat))
+        x = self.dropout(x)
+        return self.fc2(x)
+
+# Initialize tokenizer and vocabulary
+tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
+vocab = torch.load('vocab.pth')  # Assuming you've saved the vocabulary
+
+# Define text pipeline
+def text_pipeline(x):
+    return [vocab[token] for token in tokenizer(x)]
+
+# Model parameters
+VOCAB_SIZE = len(vocab)
+EMBED_DIM = 600
+NUM_FILTERS = 600
+FILTER_SIZES = [3, 4, 5, 6, 7, 8, 9, 10]
+LSTM_HIDDEN_DIM = 768
+OUTPUT_DIM = 2
+DROPOUT = 0.5
+PAD_IDX = vocab["<pad>"]
+
+# Load the model
+model = ContactSharingClassifier(VOCAB_SIZE, EMBED_DIM, NUM_FILTERS, FILTER_SIZES, LSTM_HIDDEN_DIM, OUTPUT_DIM, DROPOUT, PAD_IDX)
+model.load_state_dict(torch.load('contact_sharing_epoch_1.pth', map_location=device))
+model.to(device)
+model.eval()
+
+# Test sentences
+test_sentences = [
+    "You can reach me at my electronic mail address, it's my first name dot last name at that popular search engine company's mail service.",
+    "Call me on my cellular device, the digits are the same as the year the Declaration of Independence was signed, followed by my birth year, twice.",
+    "Visit my online presence at triple w dot my full name without spaces or punctuation dot com.",
+    "Send a message to username 'not_my_real_name' on that instant messaging platform that starts with 'disc' and ends with 'ord'.",
+    "My contact info is hidden in this sentence: Eight Six Seven Five Three Oh Nine.",
+    "Find me on the professional networking site, just search for my name plus 'software engineer in San Francisco'.",
+    "My handle on the bird-themed social media platform is at symbol followed by 'definitely_not_my_email_address'.",
+    "You know that video sharing site? My channel is there, just add 'cool_coder_' before my full name, all lowercase.",
+    "I'm listed in the phone book under 'Smith, John' but replace 'Smith' with my actual last name and 'John' with my first name.",
+    "My contact details are encrypted: Rot13('zl.rznvy@tznvy.pbz')",
+
+    # New non-contact sharing examples
+    "The weather today is absolutely beautiful, perfect for a picnic in the park.",
+    "I'm really excited about the new sci-fi movie coming out next month.",
+    "Did you hear about the latest advancements in artificial intelligence? It's fascinating!",
+    "I'm planning to go hiking this weekend in the nearby mountains.",
+    "The recipe calls for two cups of flour and a pinch of salt.",
+    "The annual tech conference will be held virtually this year due to ongoing health concerns.",
+    "I've been learning to play the guitar for the past six months. It's challenging but rewarding.",
+    "The local farmer's market has the freshest produce every Saturday morning.",
+    "Did you catch the game last night? It was an incredible comeback in the final quarter!",
+    "Lets do '42069' tonight it will be really fun what do you say ?"
+]
+
+# JIT Script the model for faster inference
+scripted_model = torch.jit.script(model)
+
+# Preallocate padding tensor to avoid repeated memory allocation
+MAX_LEN = max(FILTER_SIZES)
+padding_tensor = torch.zeros(1, MAX_LEN, dtype=torch.long).to(device)
+
+# Prediction function using JIT and inference optimizations
+def predict(text):
+    with torch.inference_mode():  # Use inference mode instead of no_grad
+        inputs = torch.tensor([text_pipeline(text)]).to(device)
+
+        # Perform padding if necessary
+        if inputs.size(1) < MAX_LEN:
+            inputs = torch.cat([inputs, padding_tensor[:, :MAX_LEN - inputs.size(1)]], dim=1)
+
+        # Pass inputs through the scripted model
+        outputs = scripted_model(inputs)
+
+        # Return predicted class
+        return torch.argmax(outputs, dim=1).item()
+
+def batch_predict(texts):
+    with torch.inference_mode():  # Use inference mode for better performance
+        # Tokenize and convert to tensors
+        inputs = [torch.tensor(text_pipeline(text)) for text in texts]
+
+        # Pad all sequences to the length of the longest one in the batch
+        max_len = max(len(seq) for seq in inputs)
+        padded_inputs = torch.stack([torch.cat([seq, torch.zeros(max_len - len(seq), dtype=torch.long)]) for seq in inputs]).to(device)
+
+        # Pass the batch through the scripted model
+        outputs = scripted_model(padded_inputs)
+
+        # Return predicted classes for each sentence
+        predictions = torch.argmax(outputs, dim=1).cpu().numpy()
+        return predictions
+
+# Test the sentences
+for i, sentence in enumerate(test_sentences, 1):
+    prediction = predict(sentence)
+    result = "Contains contact info" if prediction == 1 else "No contact info"
+    print(f"Sentence {i}: {result}")
+    print(f"Text: {sentence}\n")
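
predictor.py loads vocab.pth and the checkpoint at import time, TorchScripts the model, and then runs its built-in loop over test_sentences. A usage sketch for the two helpers that app.py imports, assuming both .pth files sit in the working directory and the saved vocabulary resolves out-of-vocabulary tokens (e.g. via a default index):

# Importing predictor also triggers the test loop at the bottom of the module.
from predictor import predict, batch_predict

print(predict("Text me at 555-867-5309"))  # 1 = flagged as contact info, 0 = not flagged

# batch_predict pads only to the longest sequence in the batch, so keep at least one
# input of max(FILTER_SIZES) = 10 tokens or more to satisfy the widest conv kernel.
print(batch_predict([
    "Reach me at jane@example.com whenever you get a chance tomorrow morning",
    "The recipe calls for two cups of flour and a pinch of salt.",
]))  # numpy array of 0/1 class indices, one per input
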
requirements.txt
ADDED
@@ -0,0 +1,70 @@
+annotated-types==0.7.0
+anyio==4.6.0
+blis==0.7.11
+catalogue==2.0.10
+certifi==2024.8.30
+charset-normalizer==3.3.2
+click==8.1.7
+cloudpathlib==0.19.0
+colorama==0.4.6
+confection==0.1.5
+cymem==2.0.8
+distro==1.9.0
+en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889
+fastapi==0.115.0
+filelock==3.13.1
+fsspec==2024.2.0
+greenlet==3.1.1
+groq==0.11.0
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.2
+huggingface-hub==0.25.1
+idna==3.10
+Jinja2==3.1.3
+langcodes==3.4.0
+language_data==1.2.0
+marisa-trie==1.2.0
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+mdurl==0.1.2
+mpmath==1.3.0
+murmurhash==1.0.10
+networkx==3.2.1
+numpy==1.26.3
+packaging==24.1
+pillow==10.2.0
+preshed==3.0.9
+pydantic==2.9.2
+pydantic_core==2.23.4
+Pygments==2.18.0
+PyYAML==6.0.2
+regex==2024.9.11
+requests==2.32.3
+rich==13.8.1
+safetensors==0.4.5
+setuptools==70.0.0
+shellingham==1.5.4
+smart-open==7.0.4
+sniffio==1.3.1
+spacy==3.7.6
+spacy-legacy==3.0.12
+spacy-loggers==1.0.5
+SQLAlchemy==2.0.35
+srsly==2.4.8
+starlette==0.38.5
+sympy==1.12
+thinc==8.2.5
+tokenizers==0.19.1
+torch==2.2.0
+torchdata==0.7.1
+torchtext==0.16.2
+tqdm==4.66.5
+transformers==4.44.2
+typer==0.12.5
+typing_extensions==4.12.2
+urllib3==2.2.3
+uvicorn==0.30.6
+wasabi==1.1.3
+weasel==0.4.1
+wrapt==1.16.0
test.py
ADDED
@@ -0,0 +1,166 @@
+import asyncio
+import aiohttp
+import json
+from tqdm.asyncio import tqdm
+
+test_texts = [
+    "You can reach me at triple eight, then the square of 7, followed by 2^10",
+    "Drop a line to first_name [underscore] last_name at that company with a fruit logo dot com",
+    "Find me on the platform where professionals connect: J. Doe, Senior Developer at TechCorp",
+    "Message me on that app with the ghost icon: @ShadowWhisperer2023",
+    "Contact via carrier pigeon: coordinates 40.7128° N, 74.0060° W",
+    "Ping me on the federated network: @cooluser@mastodon.social",
+    "My contact is the reverse of moc.elpmaxe@eodnhoj",
+    "Reach out using morse: -... -.-- -....- . -- .- .. .-..",
+    "Find me on the platform with blue checkmarks: @RealJohnDoe (parody)",
+    "Send a message to username 'l33tc0d3r' on that platform for developers",
+    "You can locate me at the place where the streets have no name, in the city of angels",
+    "My digits are the Fibonacci sequence up to 21, concatenated",
+    "Contact: foxtrot oscar oscar at bravo alpha romeo dot charlie oscar mike",
+    "Beep me at the number you get when you multiply 555 by 1.5, then add 867-5309",
+    "I'm on that app where you share shortvideos: @Dancing2023",
+    "Reach out via electronic mail to 'lastnamefirstinitial' at that search engine company dot com",
+    "Call me at the number you get when you solve this equation: 2x + 5 = 13, then 555-MATH",
+    "My handle on that photo-sharing app is @SunsetSnapper_42",
+    "You can find me at the intersection of Binary Boulevard and Algorithm Avenue",
+    "Contact info: romeo oscar charlie kilo echo tango mike alpha november at zulu uniform lima uniform dot india oscar",
+    "Find me at 51.4778° N, and solve for x: x - 0.0019 = 0.1278° W",
+    "DM me at 📧👤💻🐦. Guess the platform 😉",
+    "If you add 2 to the area code of Los Angeles, you'll find the first 3 digits of my number",
+    "Ping me on the platform with 2 birds in its logo (and no, it's not a zoo!)",
+    "You can decode my email address: base64 for JmRvZGVAc2FtcGxlLmNvbQ==",
+    "You’ll find me on the platform that rhymes with 'squeaker' and involves chirps",
+    "If you reverse the letters of com.gmail@john and remove 'moc', you'll get my contact",
+    "For inquiries, try contacting me at 'first name.last name', but think of the sound fruit makes when it's dropped",
+    "Use morse to reach out: dash dot dot dash underscore underscore dash dot dot (first name at techcorp dot com)",
+    "Contact: solve 5x - 3 = 12 for x, that’s my lucky number for the area code, followed by the square root of 144 for the rest",
+    "Reach out on that site where professionals connect, my name rhymes with 'noe' and I’m a senior engineer at T-Corp",
+    "Shoot me a message on the photo-sharing app where sunsets get all the likes: handle is the same as my name in reverse",
+    "If you count the number of words in 'five stars' you'll get the first two digits of my handle on that coding platform",
+    "My email is hidden: find the cube root of 27, followed by the first name of a famous fruit and 'dot com'",
+    "If you multiply the number of days in March by 5, you'll get my contact digits",
+    "Contact me on the short-video app, my handle starts with a 'D' and ends with '23'!",
+    "Try to find me where algorithms reign and the search begins: think of a query that contains my last name and 'solutions'",
+    "Use binary to get my location: 01000101 01001110 01000111",
+    "You can ping me at 'bestcoder42' on the app where code flows like water",
+    "My digits: sum of first four Fibonacci numbers for the area code, and the next three for the phone number",
+    "Find me at 51.4778° N, and solve for x: x - 0.0019 = 0.1278° W",
+    "DM me at 📧👤💻🐦. Guess the platform 😉",
+    "If you add 2 to the area code of Los Angeles, you'll find the rest of my digits hiding nearby",
+    "Ping me at 'FirstnameLastname reversed' at that search company 🧐",
+    "The sum of the first two primes gives you the first part of my number, and 10 squared gets you the rest",
+    "Drop a message on the 'app named after a bird' to @JohnDoe2024 🌐",
+    "Morse this one: .... . .-.. .-.. --- @ secret-agent",
+    "Let’s connect: 3rd letter of my last name, then an underscore, then my birth year at fruit-company dot com 🍏",
+    "I'm on the platform for professionals but my handle is just a smiley face, hint hint 😉",
+    "Look up the coordinates of Big Ben and you might just find where I hang out 🕰️",
+    "Combine the atomic number of helium with my favorite fruit and you'll get my email",
+    "Find me at 'underscore emoji fan' at the app where people share funny short videos 🤳",
+    "Think of the number 404, then multiply it by 2, that’s the area code. The rest is easy!",
+    "I'm always up for a chat, just decode 01000011 01100001 01101100 01101100",
+    "I’m @SilentWhisper42 on the app where conversations vanish into thin air 👻",
+    "Track me down with this: Alpha-Bravo-Charlie at that company with flying machines ✈️",
+    "Ever heard of Fibonacci? My digits follow the pattern, up to 21",
+    "Search for the name of the singer of 'Rocket Man,' and you'll have part of my contact info 🧑🚀",
+    "Just send a message to 'TechWizard' at the email service that rhymes with whale-mail 🐋",
+    "My username on that site for devs is 'leet_hacker', but you’ll need to solve for x to figure out the rest!",
+    "My digits? Picture the number of planets in the solar system before 2006, then square it.",
+    "If you know the atomic numbers of oxygen and hydrogen, combine them and you have my first two digits.",
+    "Contact me where knowledge is power, at the symbol of illumination followed by 'dot org'.",
+    "I'm @user and you'll find me on the app where one tweets, but reverse that bird's sound first.",
+    "Think of the area code for New York, subtract one, and you’re almost there.",
+    "Reach out at 'wizard@', then imagine the home of the brave and the land of the free, followed by 'com'.",
+    "My handle is a palindrome on that platform where people share their lives one square at a time.",
+    "Find me at the intersection of 7 squared and the cube root of 8, you'll know the digits.",
+    "Ping me at 'Firstname reversed' dot 'company with a shopping cart logo'.",
+    "Send a message to the name of the president in 1993 at the platform where developers share code.",
+    "You can reach me at the number that shares its name with a famous Chicago bull, then add 10.",
+    "Reach out on the platform with the blue checkmarks, where I’m known as '56/8'.",
+    "The username is easy if you know your ASCII: 83 117 110 83 101 116 52 50.",
+    "For contact info, divide the year Armstrong walked on the moon by two and add the last prime number.",
+    "You can email me at the world's largest retailer with a name that rhymes with 'Hamazon'.",
+    "Catch me on the app where professionals hang out: it’s the opposite of 'InTouch'.",
+    "Look for me on the 'bird app' where my handle is my initials followed by the number of days in a leap year.",
+    "Ping me at 'Jupiter's largest moon' dot 'the company that sends rockets into space'.",
+    "Reach out at the sum of the angles in a triangle, followed by 'degrees at mail dot com'.",
+    "Message me where bytes are shared: I go by '@user_hexadecimal_4D2' on that site.",
+    "You can send it to 🌍 world_dot_explorer @ 'web page where you explore the world'.",
+    "数字 4 (Chinese), then 'underscore', then 'techie' at the search giant.",
+    "Write to me at the country with a maple leaf symbol, at their email provider.",
+    "Feel free to ping me at Жака at mail dot ru (that’s Russian for Jack).",
+    "If you take the French word for 'sun' and add 'shine', that’s where you can reach me.",
+    "My digits? They hide in plain sight: 42-4*8+18. Just subtract the stars.",
+    "Drop me a line at 'developer' followed by the country code for India, dot com.",
+    "Where to find me? It's obvious: 'who's' dot 'this', at the dot that ends with 'gov'.",
+    "A long story short: email me at 'fruit-company', the one that used to be a tree 🌳.",
+    "You'll get my email by figuring out: my first pet’s name, the city I grew up in, dot org.",
+    "My number? It's encrypted as SHA-256. Just decode it and you'll know!",
+    "Write to me in the ancient language of the Romans: 'maximus at something_prime dot com'.",
+    "Reach out to the winner of 2022's football world cup at 'world champions dot com'.",
+    "Find me at the place where the Eiffel Tower stands tall, at dot 'home of baguettes'.",
+    "Hit me up via snail mail: Just translate 'rabbit' into Italian and add 'at Italian mail'.",
+    "My digits form a prime sequence starting from 11, just keep counting!",
+    "For my number, follow the clues hidden in Da Vinci's most famous painting."
+]
+import time
+# url = "https://vidhitmakvana1-contact-sharing-recognizer-api.hf.space/batch_detect_contact"
+url = "http://localhost:8000/batch_detect_contact"
+
+async def process_batch(session, texts):
+    payload = {"texts": texts}
+    headers = {"Content-Type": "application/json"}
+
+    start_time = time.time()
+    async with session.post(url, data=json.dumps(payload), headers=headers) as response:
+        if response.status == 200:
+            results = await response.json()
+            end_time = time.time()
+            for result in results:
+                result['response_time'] = (end_time - start_time) / len(texts)
+            return results
+        else:
+            print(f"Error for batch")
+            print(f"Status code: {response.status}")
+            print(f"Response: {await response.text()}")
+            return None
+
+async def main():
+    # Inflate test_texts
+    inflated_texts = test_texts * 100  # Multiply the test set by 100
+
+    async with aiohttp.ClientSession() as session:
+        batch_size = 1000
+        batches = [inflated_texts[i:i + batch_size] for i in range(0, len(inflated_texts), batch_size)]
+
+        tasks = [process_batch(session, batch) for batch in batches]
+        all_results = await tqdm.gather(*tasks)
+
+    results = [item for sublist in all_results for item in sublist if sublist]
+
+    correct_predictions = 0
+    total_predictions = len(results)
+    total_response_time = 0
+
+    for result in results:
+        if result:
+            print(f"Text: {result['text']}")
+            print(f"Is Contact Info: {result['is_contact_info']}")
+            print(f"Method: {result['method']}")
+            print(f"Response Time: {result['response_time']:.4f} seconds")
+            print("---")
+
+            # Assuming all texts in test_texts are actually contact information
+            if result['is_contact_info']:
+                correct_predictions += 1
+
+            total_response_time += result['response_time']
+
+    accuracy = correct_predictions / total_predictions
+    average_response_time = total_response_time / total_predictions
+    print(f"Accuracy: {accuracy:.2f}")
+    print(f"Average Response Time: {average_response_time:.4f} seconds")
+
+if __name__ == "__main__":
+    while True:
+        asyncio.run(main())
vocab.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:28edf2ae44d144c4566f0e5f95b856391166ac138ee578bac7fd9db151e1790a
+size 5184491