jclian91 committed
Commit 072db73 · verified · 1 Parent(s): 6ade0b2

update app.py, add log

Files changed (1):
1. app.py (+3, -0)
app.py CHANGED
@@ -1,5 +1,6 @@
 import gradio as gr
 import numpy as np
+from datetime import datetime
 from transformers import AutoModel, AutoTokenizer
 
 # load model and tokenizer
@@ -10,6 +11,7 @@ model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-zh', trust_rem
 def chunk_by_sentences(input_text: str, tokenizer: callable, separator: str):
     inputs = tokenizer(input_text, return_tensors='pt', return_offsets_mapping=True)
     punctuation_mark_id = tokenizer.convert_tokens_to_ids(separator)
+    print("time: ", datetime.now())
     print(f"separator: {separator}, punctuation_mark_id: {punctuation_mark_id}")
     sep_id = tokenizer.eos_token_id
     token_offsets = inputs['offset_mapping'][0]
@@ -57,6 +59,7 @@ def late_chunking(model_output, span_annotation, max_length=None):
 
 
 def embedding_retriever(query_input, text_input, separator):
+    print(f"query: {query_input}")
     chunks, span_annotations = chunk_by_sentences(text_input, tokenizer, separator)
     print(f"chunks: ", chunks)
     inputs = tokenizer(text_input, return_tensors='pt', max_length=4096, truncation=True)
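
For reference, a minimal standalone sketch of what the newly added log statements print, using the same tokenizer name that app.py references. The sample text, separator, and query below are illustrative assumptions, not values from the repository.

from datetime import datetime

from transformers import AutoTokenizer

# Same tokenizer as app.py uses for sentence chunking.
tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-zh')

sample_text = "今天天气很好。我们去公园散步。"  # illustrative input (assumption)
separator = "。"                               # illustrative sentence separator (assumption)

inputs = tokenizer(sample_text, return_tensors='pt', return_offsets_mapping=True)
punctuation_mark_id = tokenizer.convert_tokens_to_ids(separator)

# Timestamp log added to chunk_by_sentences by this commit,
# followed by the separator log line that was already present.
print("time: ", datetime.now())
print(f"separator: {separator}, punctuation_mark_id: {punctuation_mark_id}")

# Query log added to embedding_retriever by this commit.
query_input = "公园在哪里？"  # illustrative query (assumption)
print(f"query: {query_input}")

Running this prints the current timestamp, the token id that chunk_by_sentences splits on, and the incoming query string.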