gopikrsmscs committed
Commit • a61e8b3 • 1 Parent(s): 53a24c9
First model version
Browse files
- README.md +7 -6
- app.py +29 -0
- model.py +33 -0
- model1.py +32 -0
- preparedata.py +53 -0
- requirements.txt +5 -0
- saved_corpus.pt +3 -0
- saved_corpus.txt +0 -0
- saved_corpus_list.txt +0 -0
- train.py +38 -0
README.md
CHANGED
@@ -1,13 +1,14 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: iSeBetter
+emoji: 👩🏽‍💻
+colorFrom: purple
+colorTo: blue
 sdk: streamlit
-sdk_version: 1.27.2
 app_file: app.py
 pinned: false
 license: mit
 ---
 
-
+## iSeBetter: Transformer
+
+Description: A semantic search transformer model fine-tuned on the PyTorch GitHub issues dataset, hosted on Hugging Face, and integrated with Streamlit for easy use.
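The search flow this description implies can be sketched end to end. A minimal illustration, assuming the fine-tuned model directory iSeBetter produced by model.py below is available locally; the corpus and query strings here are made up (note that app.py itself encodes with the base all-mpnet-base-v2 model rather than the fine-tuned weights):

import torch
from sentence_transformers import SentenceTransformer, util

# Load the fine-tuned model from the local directory saved by model.py
model = SentenceTransformer('iSeBetter')

# Embed a toy corpus of issue titles and a free-text query
corpus = ["CUDA out of memory in backward pass",
          "DataLoader hangs when num_workers > 0"]
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
query_embedding = model.encode("GPU memory error during training", convert_to_tensor=True)

# Rank corpus entries by cosine similarity to the query
scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
best = torch.topk(scores, k=1)
print(corpus[int(best.indices[0])], float(best.values[0]))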
app.py
ADDED
@@ -0,0 +1,29 @@
import streamlit as st
import torch
from sentence_transformers import SentenceTransformer, util

# Embedding model; must match the model used to build saved_corpus.pt
embedder = SentenceTransformer('all-mpnet-base-v2')

st.title("iSeBetter : Semantic Transformer")
st.header("Analyzing Patterns in Text")

text_input = st.text_area("Enter the issue details below:")

if st.button("Analyse the Issues"):
    # Encode the query and compare it against the precomputed corpus embeddings
    query_embedding = embedder.encode(text_input, convert_to_tensor=True)
    corpus_embeddings = torch.load('saved_corpus.pt')
    corpus_embeddings_name = torch.load('saved_corpus_list.txt')  # issue titles, a list saved via torch.save
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=5)

    # Results presentation
    st.subheader("Top 5 Matched Results:")
    for score, idx in zip(top_results[0], top_results[1]):
        st.markdown(f"- **{corpus_embeddings_name[idx]}** (Score: {score:.4f})")
        st.progress(min(max(score.item(), 0.0), 1.0))  # st.progress expects a value in [0, 1]
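Streamlit reruns the whole script on every interaction, so app.py reloads the embedder and both saved artifacts on each button click. A hedged sketch of how the loads could be cached, assuming a recent Streamlit that provides st.cache_resource (the pinned 0.90.0 release predates it and used @st.cache instead):

import streamlit as st
import torch
from sentence_transformers import SentenceTransformer

@st.cache_resource  # cache across reruns; requires Streamlit >= 1.18
def load_search_assets():
    # Load the embedder and precomputed corpus once per process
    embedder = SentenceTransformer('all-mpnet-base-v2')
    corpus_embeddings = torch.load('saved_corpus.pt')
    corpus_titles = torch.load('saved_corpus_list.txt')
    return embedder, corpus_embeddings, corpus_titles

embedder, corpus_embeddings, corpus_titles = load_search_assets()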
model.py
ADDED
@@ -0,0 +1,33 @@
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
from datasets import load_dataset

dataset = load_dataset("gopikrsmscs/torch-issues")

# Create InputExamples from the dataset; a datasets.Dataset is iterated
# directly (iterrows() is a pandas API and would fail here)
examples = []
for row in dataset['train']:
    title = row['Title']
    body = row['Body']
    examples.append(InputExample(texts=[title, body]))

# Load the pre-trained model
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Define a DataLoader for training
train_dataloader = DataLoader(examples, shuffle=True, batch_size=16)

# Fine-tune the model. The (title, body) pairs carry no similarity labels,
# so use a loss that treats the other pairs in a batch as negatives;
# CosineSimilarityLoss would require a float label on each example.
train_loss = losses.MultipleNegativesRankingLoss(model)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=2,  # adjust the number of training epochs as needed
    warmup_steps=100,
    optimizer_params={'lr': 1e-4},
)

# Save the fine-tuned model
model.save('iSeBetter')
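CosineSimilarityLoss is the natural alternative here, but it consumes a float similarity label on every InputExample, which the raw (title, body) pairs do not have. If labeled pairs were available, the cosine loss could be used instead; a sketch under that assumption, with made-up examples and labels:

from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Hypothetical labeled pairs: 1.0 for a matching title/body, 0.0 for a mismatch
labeled_examples = [
    InputExample(texts=["CUDA OOM in backward",
                        "The backward pass runs out of GPU memory."], label=1.0),
    InputExample(texts=["CUDA OOM in backward",
                        "Fix a typo in the README."], label=0.0),
]
train_loss = losses.CosineSimilarityLoss(model)  # consumes the float labels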
model1.py
ADDED
@@ -0,0 +1,32 @@
import torch
from sentence_transformers import SentenceTransformer
from datasets import load_dataset

embedder = SentenceTransformer('all-mpnet-base-v2')

dataset = load_dataset("gopikrsmscs/torch-issues")

# Keep only the Title and Body columns
columns = dataset['train'].column_names
columns_to_keep = ["Title", "Body"]
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
dataset = dataset.remove_columns(columns_to_remove)

dataset_dict = dataset['train']

# Collect the issue titles to serve as the search corpus
examples = []
for row in dataset_dict:
    title = row['Title']
    examples.append(title)

# Persist the raw titles (torch.save serializes plain Python lists too)
file_paths = 'saved_corpus.txt'
torch.save(examples, file_paths)

# Encode the corpus and save the embedding tensor to file
corpus_embeddings = embedder.encode(examples, convert_to_tensor=True)
file_path = 'saved_corpus.pt'
torch.save(corpus_embeddings, file_path)
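The commit ships both saved_corpus.txt (written here) and saved_corpus_list.txt (read by app.py), presumably copies of the same list. A small sanity check, assuming both artifacts sit in the working directory, confirms the titles and embeddings line up one-to-one:

import torch

# Reload both artifacts and check they correspond row for row
titles = torch.load('saved_corpus.txt')
embeddings = torch.load('saved_corpus.pt')

assert len(titles) == embeddings.shape[0], "each title needs exactly one embedding row"
print(f"{len(titles)} titles, embedding dimension {embeddings.shape[1]}")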
preparedata.py
ADDED
@@ -0,0 +1,53 @@
import requests
import csv

# Set the GitHub repository and the API endpoint
repo_url = "https://api.github.com/repos/pytorch/pytorch"
issues_endpoint = f"{repo_url}/issues"

# Initialize an empty list to store all issues
all_issues = []

page = 1
while True:
    # Set the parameters for the API request
    params = {
        'state': 'open',   # Change to 'closed' for closed issues
        'per_page': 100,   # Number of issues per page (adjust as needed)
        'page': page
    }

    # Send the GET request to fetch issues
    response = requests.get(issues_endpoint, params=params)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        issues = response.json()
        if not issues:
            break  # No more issues to fetch
        all_issues.extend(issues)
        page += 1
    else:
        print(f"Failed to retrieve issues. Status code: {response.status_code}")
        break

# Open a CSV file for writing
with open('torch_github_issues.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    fieldnames = ['Serial Number', 'Issue Number', 'Title', 'Labels', 'Body', 'Comments']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    # Write the header row
    writer.writeheader()

    # Write the issues data to the CSV file
    for serial, issue in enumerate(all_issues, start=1):
        writer.writerow({
            'Serial Number': serial,
            'Issue Number': issue['number'],
            'Title': issue['title'],
            'Labels': ', '.join(label['name'] for label in issue['labels']),
            'Body': issue['body'],
            'Comments': issue['comments']
        })

print("Data written to torch_github_issues.csv")
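Two practical caveats with this fetcher: the GitHub /issues endpoint also returns pull requests (they carry a 'pull_request' key), and unauthenticated requests are limited to 60 per hour. A hedged sketch of both fixes, assuming a token in a hypothetical GITHUB_TOKEN environment variable:

import os
import requests

headers = {}
token = os.environ.get("GITHUB_TOKEN")  # hypothetical env var holding a personal access token
if token:
    headers["Authorization"] = f"token {token}"  # authenticated requests get a much higher rate limit

response = requests.get(
    "https://api.github.com/repos/pytorch/pytorch/issues",
    params={'state': 'open', 'per_page': 100, 'page': 1},
    headers=headers,
)
# Pull requests appear in the issues listing but carry a 'pull_request' key
issues_only = [item for item in response.json() if 'pull_request' not in item]
print(f"{len(issues_only)} true issues on this page")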
requirements.txt
ADDED
@@ -0,0 +1,5 @@
streamlit==0.90.0
torch==1.10.0
datasets==1.13.0
sentence-transformers==2.1.0
transformers==4.34.0  # assumed 4.x pin: transformers 5.5.1 does not exist, and sentence-transformers 2.1.0 requires transformers<5
saved_corpus.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5516ae0fa4699b9424f36d8588a69ec4947181d27eac25ca3519824a737366a4
size 18433205
saved_corpus.txt
ADDED
Binary file (433 kB)

saved_corpus_list.txt
ADDED
Binary file (433 kB)
train.py
ADDED
@@ -0,0 +1,38 @@
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW  # AdamW from torch.optim; the copy in transformers is deprecated
import torch

# Load a pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

# Prepare your dataset (replace these placeholders with your dataset loading code)
train_texts = ["Text of issue 1", "Text of issue 2"]
labels = [0, 1]  # 0 for non-relevant, 1 for relevant

# Tokenize and convert your dataset to tensors
encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors="pt")
labels = torch.tensor(labels)

# Set up data loaders
dataset = torch.utils.data.TensorDataset(encodings["input_ids"], encodings["attention_mask"], labels)
train_loader = torch.utils.data.DataLoader(dataset, batch_size=32)

# Define the optimizer; the model computes cross-entropy loss internally
# when labels are passed, so no separate loss function is needed
optimizer = AdamW(model.parameters(), lr=1e-5)

# Fine-tune the model
model.train()
for epoch in range(3):  # Replace with desired number of epochs
    for batch in train_loader:
        input_ids, attention_mask, batch_labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Save the fine-tuned model
model.save_pretrained("/path/to/save/model")

# You can now use this model for semantic search.
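Once saved, the classifier can score new issue text for relevance. A minimal inference sketch, assuming the placeholder save path above has been replaced with a real directory; the tokenizer is reloaded from bert-base-uncased because the script saves only the model weights:

import torch
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("/path/to/save/model")
model.eval()

# Score one new issue text; index 1 corresponds to the 'relevant' label used in training
inputs = tokenizer("DataLoader hangs when num_workers > 0", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
probs = torch.softmax(logits, dim=-1)
print(f"P(relevant) = {probs[0, 1]:.3f}")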