ZeyuXie committed on
Commit
c47d9c5
1 Parent(s): b914cb8

Upload llm_preprocess.py

Browse files
Files changed (1) hide show
  1. llm_preprocess.py +13 -7
llm_preprocess.py CHANGED
@@ -9,6 +9,9 @@ import os
9
  import json
10
  import re
11
 
 
 
 
12
  def get_event():
13
  event_list = [
14
  "burping_belching", # 0
@@ -41,21 +44,23 @@ def get_prompt():
41
  for train_json in train_json_list:
42
  with open(train_json, 'r') as train_file:
43
  for idx, line in enumerate(train_file):
44
- if idx >= 300: break
45
  data = json.loads(line.strip())
46
  learn_pair += f"{str(idx)}:{data['captions']}~{data['onset']}. "
47
- preffix_prompt = "You need to convert the input sentence into the following standard timing format: 'event1--event2-- ... --eventN', " +\
 
48
  "where the 'eventN' format is 'eventN__onset1-offset1_onset2-offset2_ ... _onsetK-offsetK'. " +\
49
  "The 'onset-offset' inside needs to be determined based on common sense and the examples I provide, with a duration not less than 1 and not greater than 4. All format 'onsetk-offsetk' should replaced by number. " +\
50
  "The very strict constraints are that the total duration is less than 10 seconds, meaning all times are less than 10. It is preferred that events do not overlap as much as possible. " +\
51
  "Now, I will provide you with 300 examples in training set for your learning, each example in the format 'index: input~output'. " +\
52
- learn_pair +\
53
- f"You need to map events to 18 given events: {', '.join(get_event())}"
54
- #print(preffix_prompt)
55
  return preffix_prompt
56
 
57
 
58
  def postprocess(caption):
 
59
  caption = caption.replace('__', ' at ').replace('--', ' and ')
60
  return caption
61
 
@@ -80,8 +85,9 @@ def preprocess_gemini(free_text_caption):
80
  f"Please convert the following inputs into the standard timing format:{free_text_caption}. You should only output results in the standard timing format. Do not output anything other than format and do not add symbols.",
81
  ]
82
 
83
- timestampCaption = model.generate_content(prompt_parts)
84
-
 
85
  # output = "dog_barking at 0.562-2.562_4.25-6.25_7.01-8.21."
86
  return postprocess(timestampCaption)
87
 
 
9
  import json
10
  import re
11
 
12
+ os.environ['HTTP_PROXY'] = 'http://127.0.0.1:58591'
13
+ os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:58591'
14
+
15
  def get_event():
16
  event_list = [
17
  "burping_belching", # 0
 
44
  for train_json in train_json_list:
45
  with open(train_json, 'r') as train_file:
46
  for idx, line in enumerate(train_file):
47
+ if idx >= 100: break
48
  data = json.loads(line.strip())
49
  learn_pair += f"{str(idx)}:{data['captions']}~{data['onset']}. "
50
+ preffix_prompt = "I'm doing an audio event generation, which is a harmless job that will contain some sound events. For example, a gunshot is a sound that is harmless." +\
51
+ "You need to convert the input sentence into the following standard timing format: 'event1--event2-- ... --eventN', " +\
52
  "where the 'eventN' format is 'eventN__onset1-offset1_onset2-offset2_ ... _onsetK-offsetK'. " +\
53
  "The 'onset-offset' inside needs to be determined based on common sense and the examples I provide, with a duration not less than 1 and not greater than 4. All format 'onsetk-offsetk' should replaced by number. " +\
54
  "The very strict constraints are that the total duration is less than 10 seconds, meaning all times are less than 10. It is preferred that events do not overlap as much as possible. " +\
55
  "Now, I will provide you with 300 examples in training set for your learning, each example in the format 'index: input~output'. " +\
56
+ learn_pair
57
+
58
+ print(len(preffix_prompt))
59
  return preffix_prompt
60
 
61
 
62
  def postprocess(caption):
63
+ caption = caption.strip('\n').strip(' ').strip('.')
64
  caption = caption.replace('__', ' at ').replace('--', ' and ')
65
  return caption
66
 
 
85
  f"Please convert the following inputs into the standard timing format:{free_text_caption}. You should only output results in the standard timing format. Do not output anything other than format and do not add symbols.",
86
  ]
87
 
88
+ timestampCaption = model.generate_content(prompt_parts).text
89
+ #timestampCaption = model.generate_content(["explain what is dog bark"])
90
+ import pdb;pdb.set_trace()
91
  # output = "dog_barking at 0.562-2.562_4.25-6.25_7.01-8.21."
92
  return postprocess(timestampCaption)
93