julylun commited on
Commit
d7df580
·
1 Parent(s): 2612890
Files changed (2) hide show
  1. app.py +26 -24
  2. requirements.txt +2 -0
app.py CHANGED
@@ -1,28 +1,30 @@
1
- import gradio as gr
2
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
3
-
4
- # Load model and tokenizer
5
- model_name = "castorini/monot5-small-msmarco-10k"
6
- tokenizer = AutoTokenizer.from_pretrained(model_name)
7
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
8
-
9
- # Define reranking function
10
- def rerank(query, documents):
11
- documents = documents.split("\n") # Split documents by newlines
12
- reranked_results = []
13
-
14
- for doc in documents:
15
- # Combine query and document into a single input
16
- input_text = f"Query: {query} Document: {doc} Relevant:"
17
- inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
18
- outputs = model.generate(**inputs)
19
- # Decode the output
20
- relevance = tokenizer.decode(outputs[0], skip_special_tokens=True)
21
- reranked_results.append((doc, relevance))
22
 
23
- # Sort by relevance (assuming higher is better)
24
- reranked_results.sort(key=lambda x: x[1], reverse=True)
25
- return "\n".join([f"{doc} (Relevance: {rel})" for doc, rel in reranked_results])
 
 
 
 
 
 
 
 
26
 
27
  # Create Gradio interface
28
  interface = gr.Interface(
 
1
import os

import gradio as gr  # restored: the surviving `gr.Interface(...)` call below still needs it
import py_vncorenlp
from sentence_transformers import CrossEncoder

# VnCoreNLP requires an absolute path to an EXISTING directory for its model
# files. The previous hard-coded '/absolute/path/to/vncorenlp' was a
# placeholder copied from the model card and does not exist at runtime;
# derive a real absolute path under the working directory instead.
VNCORENLP_DIR = os.path.abspath("vncorenlp")
os.makedirs(VNCORENLP_DIR, exist_ok=True)

# Download the Vietnamese word-segmentation model once at startup, then load
# the segmenter with only the "wseg" (word segmentation) annotator enabled.
py_vncorenlp.download_model(save_dir=VNCORENLP_DIR)
rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir=VNCORENLP_DIR)
5
+
6
def rerank(query, sentences):
    """Score each candidate sentence's relevance to *query* with PhoRanker.

    Parameters
    ----------
    query : str
        Raw Vietnamese query text.
    sentences : iterable of str
        Candidate sentences to score against the query.

    Returns
    -------
    The per-sentence relevance scores as returned by
    ``CrossEncoder.predict`` (higher = more relevant).
    """
    # PhoRanker is trained on VnCoreNLP-word-segmented input, so segment
    # both sides before building the cross-encoder pairs.
    # NOTE(review): word_segment returns a LIST of segmented sentence
    # strings, so each pair element here is a list rather than a plain
    # string — this mirrors the original code, but confirm against the
    # PhoRanker model card whether the segments should be joined with
    # " ".join(...) first.
    tokenized_query = rdrsegmenter.word_segment(query)
    tokenized_sentences = [rdrsegmenter.word_segment(sent) for sent in sentences]

    tokenized_pairs = [[tokenized_query, sent] for sent in tokenized_sentences]

    # Was: model constructed inside rerank() on EVERY request, re-downloading /
    # re-initializing the cross-encoder each call. Load once and reuse.
    model = _get_ranker()

    scores = model.predict(tokenized_pairs)
    # (Removed a no-op bare string literal 'print(scores)' and a stale
    # sample-output comment left over from the model-card example.)
    return scores


def _get_ranker():
    """Lazily build the PhoRanker CrossEncoder once and cache it for reuse."""
    model = getattr(_get_ranker, "_cached", None)
    if model is None:
        # torch is a dependency of sentence-transformers, so it is always
        # importable here; local import keeps the module header unchanged.
        import torch

        MODEL_ID = 'itdainb/PhoRanker'
        MAX_LENGTH = 512
        model = CrossEncoder(MODEL_ID, max_length=MAX_LENGTH)
        # fp16 halves memory/latency on GPU, but on CPU-only hosts (the
        # typical free Space) half precision is slow or unsupported — only
        # convert when CUDA is actually available.
        if torch.cuda.is_available():
            model.model.half()
        _get_ranker._cached = model
    return model
25
+
26
+
27
+
28
 
29
  # Create Gradio interface
30
  interface = gr.Interface(
requirements.txt CHANGED
@@ -2,3 +2,5 @@ transformers
2
  gradio
3
  torch
4
  tiktoken
 
 
 
2
  gradio
3
  torch
4
  tiktoken
5
+ py_vncorenlp
6
+ sentence-transformers