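"""AINotes: a Gradio app that transcribes meeting audio with a fine-tuned
Whisper model and turns the transcript into meeting minutes through an
OpenAI-compatible chat API."""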
import torch
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
import gradio as gr

MODEL_NAME = "seiching/whisper-small-seiching"
#MODEL_NAME = "openai/whisper-small"
BATCH_SIZE = 8

device = 0 if torch.cuda.is_available() else "cpu"

pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)
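# chunk_length_s=30 makes the pipeline split long recordings into 30-second
# windows internally, so audio of arbitrary length can be transcribed.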

from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor
import tiktoken



def call_openai_api(openaiobj, transcription):
    """Summarize one transcript chunk into meeting minutes via the chat API."""
    # The system prompt (in Chinese) says: "You are a professional minute-taker.
    # Given a transcript produced by speech-recognition software (which may
    # contain errors), list the discussion items and conclusions as bullet
    # points, skip the details, and use formal, readable, non-colloquial
    # wording."
    response = openaiobj.chat.completions.create(
        model="gpt-3.5-turbo",
        temperature=0,
        messages=[
            {
                "role": "system",
                "content": "你是專業的會議紀錄製作員,請根據由語音辨識軟體將會議錄音所轉錄的逐字稿,也請注意逐字稿可能有錯,請以條列式,列出討論事項及結論,討論內容細節請略過,要用比較正式及容易閱讀的寫法,避免口語化"
            },
            {
                "role": "user",
                "content": transcription
            }
        ]
    )
    return response.choices[0].message.content
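# Usage sketch (mirrors process_chunks below; the local server URL is the one
# this app assumes is running):
# client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")
# minutes = call_openai_api(client, "...one transcript chunk...")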


def split_into_chunks(text, tokens=500):
    """Split text into chunks of at most `tokens` tokens, decoded back to text."""
    encoding = tiktoken.encoding_for_model('gpt-3.5-turbo')
    words = encoding.encode(text)
    chunks = []
    for i in range(0, len(words), tokens):
        # decode() already returns a string; joining it with ' ' would insert
        # a space between every character.
        chunks.append(encoding.decode(words[i:i + tokens]))
    return chunks
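# For example, a transcript that encodes to 1,200 tokens comes back as three
# chunks of at most 500 tokens each, keeping every request under the chat
# model's context limit.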

def process_chunks(openaikeystr, inputtext):
    # NOTE: the API key argument is currently unused; requests go to a local
    # OpenAI-compatible server. To call the hosted API instead, construct the
    # client as OpenAI(api_key=openaikeystr).
    openaiobj = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")
    chunks = split_into_chunks(inputtext)
    response = ''
    for chunk in chunks:
        response = response + call_openai_api(openaiobj, chunk)
    return response
    # To process chunks in parallel instead (sketch):
    # with ThreadPoolExecutor() as executor:
    #     responses = list(executor.map(lambda c: call_openai_api(openaiobj, c), chunks))
    # return ''.join(responses)
# Shared transcript state; transcribe() overwrites this placeholder.
transcribe_text = "this is a test"


# Copied from https://github.com/openai/whisper/blob/c09a7ae299c4c34c5839a76380ae407e7d785914/whisper/utils.py#L50
def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = "."):
    if seconds is not None:
        milliseconds = round(seconds * 1000.0)

        hours = milliseconds // 3_600_000
        milliseconds -= hours * 3_600_000

        minutes = milliseconds // 60_000
        milliseconds -= minutes * 60_000

        seconds = milliseconds // 1_000
        milliseconds -= seconds * 1_000

        hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
        return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
    else:
        # we have a malformed timestamp so just return it as is
        return seconds
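# For example, format_timestamp(3661.5) returns "01:01:01.500" and
# format_timestamp(75.0) returns "01:15.000" (hours omitted when zero).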


def transcribe(file, task, return_timestamps):
    outputs = pipe(
        file,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task, "language": "chinese"},
        return_timestamps=return_timestamps,
    )
    text = outputs["text"]
    if return_timestamps:
        timestamps = outputs["chunks"]
        timestamps = [
            f"[{format_timestamp(chunk['timestamp'][0])} -> {format_timestamp(chunk['timestamp'][1])}] {chunk['text']}"
            for chunk in timestamps
        ]
        text = "\n".join(str(feature) for feature in timestamps)
    # Keep the transcript in module state so the summary tab can reuse it.
    global transcribe_text
    transcribe_text = text

    return text


demo = gr.Blocks()

mic_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", optional=True),
        gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
        gr.inputs.Checkbox(default=False, label="Return timestamps"),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="會議紀錄小幫手AINotes",
    description=(
        "可由麥克風錄音或上傳語音檔"
        f" 使用這個模型 [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) 先做語音辨識再做會議紀錄摘要"
        " 長度沒有限制"
    ),
    allow_flagging="never",
)
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source="upload", optional=True, label="Audio file", type="filepath"),
        gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
        gr.inputs.Checkbox(default=False, label="Return timestamps"),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="會議紀錄小幫手AINotes",
    description=(
        "可由麥克風錄音或上傳語音檔"
        f" 使用這個模型 [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) 先做語音辨識再做會議紀錄摘要"
        " 長度沒有限制"
    ),
    # examples=[
    #     ["./example.flac", "transcribe", False],
    #     ["./example.flac", "transcribe", True],
    # ],
    cache_examples=True,
    allow_flagging="never",
)
def writenotes(apikeystr, inputscript):
    # If a transcript was pasted in, summarize that; otherwise fall back to
    # the transcript produced by the speech-recognition tabs. The global
    # declaration is required: assigning without it would make
    # transcribe_text local and raise UnboundLocalError when no transcript
    # is pasted.
    global transcribe_text
    if len(inputscript) > 10:
        transcribe_text = inputscript
    ainotestext = process_chunks(apikeystr, transcribe_text)
    return ainotestext
ainotes = gr.Interface(
    fn=writenotes,
    inputs=[
        gr.Textbox(label="OPEN AI API KEY", placeholder="請輸入sk..."),
        gr.Textbox(label="逐字稿", placeholder="請輸入逐字稿"),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="會議紀錄小幫手AINotes",
    description=(
        "輸入逐字稿(或先在前兩個分頁做語音辨識)"
        ",再自動產生會議紀錄摘要,長度沒有限制"
    ),
    allow_flagging="never",
)

with demo:
    gr.TabbedInterface(
        [file_transcribe, mic_transcribe, ainotes],
        ["語音檔辨識", "麥克風語音檔辨識", "產生會議紀錄"],
    )

demo.launch(enable_queue=True)