gopikrsmscs committed on
Commit a61e8b3 • 1 Parent(s): 53a24c9

First model version

Files changed (10)
  1. README.md +7 -6
  2. app.py +29 -0
  3. model.py +33 -0
  4. model1.py +32 -0
  5. preparedata.py +53 -0
  6. requirements.txt +5 -0
  7. saved_corpus.pt +3 -0
  8. saved_corpus.txt +0 -0
  9. saved_corpus_list.txt +0 -0
  10. train.py +38 -0
README.md CHANGED
@@ -1,13 +1,14 @@
 ---
-title: ISeBetter
-emoji: 📈
-colorFrom: yellow
-colorTo: pink
+title: iSeBetter
+emoji: 👩🏽‍💻
+colorFrom: purple
+colorTo: blue
 sdk: streamlit
-sdk_version: 1.27.2
 app_file: app.py
 pinned: false
 license: mit
 ---

-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+## iSeBetter: Transformer
+
+Description: A semantic search transformer model fine-tuned on the PyTorch GitHub issues dataset, hosted on Hugging Face, and integrated with Streamlit for easy use.
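
The Space implements standard sentence-transformers semantic search: embed the query, compare it against precomputed corpus embeddings with cosine similarity, and keep the top-k hits. A minimal sketch of that retrieval step outside Streamlit, using the model name and artifact files from this commit (the example query is illustrative):

```python
import torch
from sentence_transformers import SentenceTransformer, util

embedder = SentenceTransformer('all-mpnet-base-v2')
corpus_embeddings = torch.load('saved_corpus.pt')    # precomputed title embeddings
corpus_titles = torch.load('saved_corpus_list.txt')  # pickled list of issue titles

# Illustrative query; any issue description works
query_embedding = embedder.encode("DataLoader hangs when num_workers > 0", convert_to_tensor=True)
scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
values, indices = torch.topk(scores, k=5)
for score, idx in zip(values, indices):
    print(f"{corpus_titles[idx]}  (score: {score:.4f})")
```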
app.py ADDED
@@ -0,0 +1,25 @@
+import streamlit as st
+import torch
+from sentence_transformers import SentenceTransformer, util
+
+embedder = SentenceTransformer('all-mpnet-base-v2')
+
+st.title("iSeBetter: Semantic Transformer")
+st.header("Analyzing Patterns in Text")
+
+text_input = st.text_area("Enter the issue details below:")
+
+if st.button("Analyse the Issues"):
+    # Embed the query and load the precomputed corpus embeddings and titles
+    query_embedding = embedder.encode(text_input, convert_to_tensor=True)
+    corpus_embeddings = torch.load('saved_corpus.pt')
+    corpus_titles = torch.load('saved_corpus_list.txt')
+    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
+    top_results = torch.topk(cos_scores, k=5)
+
+    # Present the five closest matches with their similarity scores
+    st.subheader("Top 5 Matched Results:")
+    for score, idx in zip(top_results[0], top_results[1]):
+        st.markdown(f"- **{corpus_titles[idx]}** (Score: {score:.4f})")
+        # Clamp to [0, 1]: cosine similarity can be negative, st.progress cannot take it
+        st.progress(max(0.0, min(1.0, score.item())))
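
One practical note on app.py: Streamlit reruns the whole script on every interaction, so the embedder and corpus tensors are reloaded on each button click. A cached loader is the usual fix; a sketch using `st.cache`, the caching decorator available in the pinned streamlit 0.90.0 (newer releases would use `st.cache_resource`):

```python
import streamlit as st
import torch
from sentence_transformers import SentenceTransformer

@st.cache(allow_output_mutation=True)  # keep one copy across reruns; the model object is mutable
def load_resources():
    embedder = SentenceTransformer('all-mpnet-base-v2')
    corpus_embeddings = torch.load('saved_corpus.pt')
    corpus_titles = torch.load('saved_corpus_list.txt')
    return embedder, corpus_embeddings, corpus_titles

embedder, corpus_embeddings, corpus_titles = load_resources()
```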
model.py ADDED
@@ -0,0 +1,34 @@
+from sentence_transformers import SentenceTransformer, InputExample, losses
+from torch.utils.data import DataLoader
+from datasets import load_dataset
+
+dataset = load_dataset("gopikrsmscs/torch-issues")
+
+# Create InputExamples of (title, body) pairs from the dataset
+# (Hugging Face datasets are iterated directly; they have no iterrows())
+examples = []
+for row in dataset['train']:
+    title = row['Title']
+    body = row['Body']
+    examples.append(InputExample(texts=[title, body]))
+
+# Load the pre-trained model
+model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
+
+# Define a DataLoader for training
+train_dataloader = DataLoader(examples, shuffle=True, batch_size=16)
+
+# Fine-tune the model. The (title, body) pairs carry no similarity labels,
+# so use MultipleNegativesRankingLoss, which treats each pair as a positive
+# match (CosineSimilarityLoss would require a float label per example)
+train_loss = losses.MultipleNegativesRankingLoss(model)
+
+model.fit(
+    train_objectives=[(train_dataloader, train_loss)],
+    epochs=2,  # Adjust the number of training epochs as needed
+    warmup_steps=100,
+    optimizer_params={'lr': 1e-4},
+)
+
+# Save the fine-tuned model
+model.save('iSeBetter')
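
After training, the saved directory loads back like any sentence-transformers checkpoint. A quick sanity check, assuming the 'iSeBetter' directory written by model.save() above (the two sentences are illustrative paraphrases):

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('iSeBetter')  # directory written by model.save() above
a = model.encode("CUDA out of memory when calling backward()", convert_to_tensor=True)
b = model.encode("GPU memory exhausted during the backward pass", convert_to_tensor=True)
print(util.cos_sim(a, b).item())  # paraphrased issues should score close to 1 after fine-tuning
```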
model1.py ADDED
@@ -0,0 +1,32 @@
+import torch
+from sentence_transformers import SentenceTransformer
+from datasets import load_dataset
+
+embedder = SentenceTransformer('all-mpnet-base-v2')
+
+dataset = load_dataset("gopikrsmscs/torch-issues")
+
+# Keep only the Title and Body columns
+columns = dataset['train'].column_names
+columns_to_keep = ["Title", "Body"]
+columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
+dataset = dataset.remove_columns(columns_to_remove)
+
+dataset_dict = dataset['train']
+
+# Collect the issue titles as the search corpus
+examples = []
+for row in dataset_dict:
+    title = row['Title']
+    #body = row['Body']
+    examples.append(title)
+
+# Persist the raw titles (torch.save pickles the list despite the .txt
+# extension; note the app loads them from 'saved_corpus_list.txt')
+file_paths = 'saved_corpus.txt'
+torch.save(examples, file_paths)
+
+# Encode the corpus and save the embedding tensor for the Streamlit app
+corpus_embeddings = embedder.encode(examples, convert_to_tensor=True)
+file_path = 'saved_corpus.pt'
+torch.save(corpus_embeddings, file_path)
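
A quick round-trip check for the two artifacts written above (all-mpnet-base-v2 produces 768-dimensional embeddings):

```python
import torch

titles = torch.load('saved_corpus.txt')     # pickled list of issue titles
embeddings = torch.load('saved_corpus.pt')  # tensor of shape (len(titles), 768)
assert len(titles) == embeddings.shape[0]
print(len(titles), tuple(embeddings.shape))
```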
preparedata.py ADDED
@@ -0,0 +1,55 @@
+import requests
+import csv
+
+# Set the GitHub repository and the API endpoint
+repo_url = "https://api.github.com/repos/pytorch/pytorch"
+issues_endpoint = f"{repo_url}/issues"
+
+# Initialize an empty list to store all issues
+all_issues = []
+
+page = 1
+while True:
+    # Set the parameters for the API request
+    params = {
+        'state': 'open',   # Change to 'closed' for closed issues
+        'per_page': 100,   # Number of issues per page (100 is the API maximum)
+        'page': page
+    }
+
+    # Send the GET request to fetch issues
+    # (note: this endpoint also returns pull requests; they carry a
+    # 'pull_request' key and can be filtered out if needed)
+    response = requests.get(issues_endpoint, params=params)
+
+    # Check if the request was successful (status code 200)
+    if response.status_code == 200:
+        issues = response.json()
+        if not issues:
+            break  # No more issues to fetch
+        all_issues.extend(issues)
+        page += 1
+    else:
+        print(f"Failed to retrieve issues. Status code: {response.status_code}")
+        break
+
+# Open a CSV file for writing
+with open('torch_github_issues.csv', mode='w', newline='') as csv_file:
+    fieldnames = ['Serial Number', 'Issue Number', 'Title', 'Labels', 'Body', 'Comments']
+    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
+
+    # Write the header row
+    writer.writeheader()
+
+    # Write the issues data to the CSV file
+    for serial, issue in enumerate(all_issues, start=1):
+        writer.writerow({
+            'Serial Number': serial,
+            'Issue Number': issue['number'],
+            'Title': issue['title'],
+            'Labels': ', '.join(label['name'] for label in issue['labels']),
+            'Body': issue['body'],
+            'Comments': issue['comments']
+        })
+
+print("Data written to torch_github_issues.csv")
requirements.txt ADDED
@@ -0,0 +1,6 @@
+streamlit==0.90.0
+torch==1.10.0
+datasets==1.13.0
+sentence-transformers==2.1.0
+# transformers 5.5.1 does not exist; pin a 4.x release compatible with sentence-transformers 2.1.0
+transformers==4.11.3
saved_corpus.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5516ae0fa4699b9424f36d8588a69ec4947181d27eac25ca3519824a737366a4
+size 18433205
saved_corpus.txt ADDED
Binary file (433 kB).
 
saved_corpus_list.txt ADDED
Binary file (433 kB).
 
train.py ADDED
@@ -0,0 +1,38 @@
+from transformers import BertTokenizer, BertForSequenceClassification, AdamW
+import torch
+
+# Load a pre-trained BERT model and tokenizer
+tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
+
+# Prepare your dataset (replace with your dataset loading code)
+train_texts = ["Text of issue 1", "Text of issue 2", ...]
+labels = [0, 1, ...]  # 0 for non-relevant, 1 for relevant
+
+# Tokenize and convert your dataset to tensors
+encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors="pt")
+labels = torch.tensor(labels)
+
+# Set up data loaders
+dataset = torch.utils.data.TensorDataset(encodings["input_ids"], encodings["attention_mask"], labels)
+train_loader = torch.utils.data.DataLoader(dataset, batch_size=32)
+
+# Define the optimizer; BertForSequenceClassification computes the
+# cross-entropy loss internally when labels are passed
+optimizer = AdamW(model.parameters(), lr=1e-5)
+
+# Fine-tune the model
+model.train()
+for epoch in range(3):  # Replace with the desired number of epochs
+    for batch in train_loader:
+        input_ids, attention_mask, batch_labels = batch
+        optimizer.zero_grad()
+        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
+        loss = outputs.loss
+        loss.backward()
+        optimizer.step()
+
+# Save the fine-tuned model
+model.save_pretrained("/path/to/save/model")
+
+# You can now use this model to score issue relevance for semantic search.
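
For completeness, a hedged sketch of loading the classifier back for inference. The save path is the placeholder used above, and note that train.py saves only the model, so the tokenizer would also need a matching `tokenizer.save_pretrained` to the same directory for this to work:

```python
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Placeholder path from train.py; assumes the tokenizer was saved there too
tokenizer = BertTokenizer.from_pretrained("/path/to/save/model")
model = BertForSequenceClassification.from_pretrained("/path/to/save/model")
model.eval()

inputs = tokenizer("DataLoader crashes with num_workers > 0", return_tensors="pt", truncation=True)
with torch.no_grad():
    logits = model(**inputs).logits
relevance = torch.softmax(logits, dim=-1)[0, 1].item()  # probability of the 'relevant' class
print(f"relevance score: {relevance:.3f}")
```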