File size: 6,623 Bytes
ddc5bbd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
from fastapi import Body, Request, HTTPException, status
from fastapi.encoders import jsonable_encoder
import sys
from ..models.calls import UpdateCall, UserCall, UserCaptions
from ..operations.users import *
from utils.text_rank import extract_terms
from openai import OpenAI

from time import sleep
import os
from dotenv import dotenv_values


# Used within calls to create call record in main.py
def create_calls(collection, user: UserCall = Body(...)):
    """Insert a new call record and return it as stored.

    Args:
        collection: Collection-like object exposing ``insert_one`` and
            ``find_one``.
        user: Call payload; it is passed through ``jsonable_encoder`` before
            being inserted.

    Returns:
        The document read back from ``collection`` by the inserted ``_id``.
    """
    payload = jsonable_encoder(user)
    inserted_id = collection.insert_one(payload).inserted_id

    # Read the record back so the caller receives it exactly as persisted.
    return collection.find_one({"_id": inserted_id})


def find_call(collection, call_id: str):
    """Find a single call document by its ``call_id`` field.

    Args:
        collection: Collection-like object exposing ``find_one``.
        call_id: Value matched against the ``call_id`` field.

    Returns:
        The matching document.

    Raises:
        HTTPException: 404 when no document has this ``call_id``.
    """
    match = collection.find_one({"call_id": call_id})

    if match is None:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=f"Call with ID: '{call_id}' not found.")

    return match


def find_user_calls(collection, user_id: str):
    """List every call in which ``user_id`` is the caller or the callee.

    Args:
        collection: Collection-like object exposing ``find``.
        user_id: Value matched against ``caller_id`` and ``callee_id``.

    Returns:
        A list of matching documents. When nothing matches an empty list is
        returned (never ``None``), which the TranscriptView frontend
        component relies on.
    """
    # Match on either side of the call.
    participant_filter = {"$or": [{"caller_id": user_id}, {"callee_id": user_id}]}
    matches = list(collection.find(participant_filter))

    return matches if matches else []


def update_calls(collection, call_id: str, calls: UpdateCall = Body(...)):
    """Apply a partial update to the call identified by ``call_id``.

    Entries of ``calls`` whose value is ``None`` are treated as "not
    supplied" and are left untouched in the stored document.

    Args:
        collection: Collection-like object exposing ``update_one`` and
            ``find_one``.
        call_id: Value matched against the ``call_id`` field.
        calls: Fields to set. It is accessed through ``.items()``, so the
            caller presumably passes a mapping (e.g. a dumped model) —
            confirm against the call site.

    Returns:
        The call document as read back after the update.

    Raises:
        HTTPException: 404 when no call has this ``call_id``.
    """
    changes = {field: value for field, value in calls.items() if value is not None}

    if changes:
        update_result = collection.update_one({"call_id": call_id}, {"$set": changes})

        # Use matched_count rather than modified_count: a $set that writes the
        # values already stored matches the document but modifies nothing, and
        # must not be reported as a missing call.
        if update_result.matched_count == 0:
            raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Call not updated!")

    if (existing_item := collection.find_one({"call_id": call_id})) is not None:
        return existing_item

    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Call not found!")


def update_captions(call_collection, user_collection, call_id: str, captions: UserCaptions = Body(...)):
    """Append one caption segment to a call's ``captions`` array.

    The caption is enriched with ``author_username`` (looked up from
    ``author_id`` via ``find_name_from_id``) before it is pushed.

    Args:
        call_collection: Collection-like object holding call documents.
        user_collection: Collection passed to ``find_name_from_id`` to
            resolve the author's name.
        call_id: Value matched against the ``call_id`` field.
        captions: Caption fields. It is accessed through ``.items()``, so
            the caller presumably passes a mapping — confirm against the
            call site. Entries whose value is ``None`` are dropped.

    Returns:
        The call document as read back after the push.

    Raises:
        HTTPException: 400 when the caption has no ``author_id``; 404 when
            the push modified no document or the call cannot be read back.
    """
    caption = {field: value for field, value in captions.items() if value is not None}

    # Without an author the username lookup below cannot run; report it as a
    # client error instead of letting a bare KeyError escape.
    author_id = caption.get("author_id")
    if author_id is None:
        raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail="Caption is missing 'author_id'.")

    # Store the author's display name alongside the caption.
    caption["author_username"] = find_name_from_id(user_collection, author_id)

    # The caption always holds at least "author_username" at this point, so
    # no emptiness check is needed before pushing.
    update_result = call_collection.update_one({"call_id": call_id},
                                               {"$push": {"captions": caption}})

    if update_result.modified_count == 0:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Captions not updated!")

    if (existing_item := call_collection.find_one({"call_id": call_id})) is not None:
        return existing_item

    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Captions not found!")


def delete_calls(collection, call_id: str):
    """Delete the call identified by ``call_id``.

    Args:
        collection: Collection-like object exposing ``delete_one``.
        call_id: Value matched against the ``call_id`` field.

    Returns:
        A confirmation message when exactly one document was deleted.

    Raises:
        HTTPException: 404 when the delete did not remove exactly one
            document.
    """
    outcome = collection.delete_one({"call_id": call_id})

    if outcome.deleted_count != 1:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Call not found!")

    return "Call deleted sucessfully!"


def get_caption_text(collection, call_id, user_id):
    """Build the transcript of a call as seen by ``user_id``.

    Segments authored by ``user_id`` contribute their ``original_text``;
    every other segment contributes its ``translated_text``.

    Args:
        collection: Collection passed to ``find_call``.
        call_id: Identifier of the call to read.
        user_id: Compared against each segment's ``author_id``.

    Returns:
        The selected texts joined with single spaces, or ``None`` when the
        call document has no ``captions`` key.
    """
    call_record = find_call(collection, call_id)

    # A call that never received a caption has no "captions" key at all.
    if 'captions' not in call_record:
        return None

    return " ".join(
        segment['original_text'] if segment['author_id'] == user_id else segment['translated_text']
        for segment in call_record['captions']
    )


# approximate string matching 
def fuzzy_search(collection, user_id, query):
    """Run a fuzzy text search over calls and keep those involving ``user_id``.

    Args:
        collection: Collection-like object exposing ``drop_indexes``,
            ``create_index`` and ``aggregate``.
        user_id: Only documents whose ``caller_id`` or ``callee_id`` equals
            this value are returned.
        query: Search text passed to the ``$search`` stage.

    Returns:
        A list of the matching call documents the user took part in.
    """
    # Rebuild the text index over both caption fields.
    # NOTE(review): drop_indexes() removes every index on the collection on
    # each search, which is expensive and discards indexes created elsewhere.
    # The `$search` stage is also presumably served by an Atlas Search index
    # rather than this text index. Confirm both before changing.
    collection.drop_indexes()
    collection.create_index(
        [('captions.original_text', 'text'), ('captions.translated_text', 'text')],
        name='captions',
    )

    pipeline = [
        {
            "$search": {
                "text": {
                    "query": query,
                    "path": {"wildcard": "*"},
                    "fuzzy": {},
                }
            }
        }
    ]

    # The search spans all calls; keep only those the user took part in.
    return [
        doc
        for doc in collection.aggregate(pipeline)
        if doc['caller_id'] == user_id or doc['callee_id'] == user_id
    ]


def summarise(collection, call_id, user_id, target_language):
    """Generate a summary of a call transcript and store it per user.

    The transcript is obtained from ``get_caption_text``, sent to the OpenAI
    chat completions API, and the raw reply is stored on the call document
    under ``summaries.<user_id>``.

    Args:
        collection: Collection holding call documents.
        call_id: Identifier of the call to summarise.
        user_id: User whose view of the transcript is summarised; also the
            key under which the summary is stored.
        target_language: Language name interpolated into the prompt.

    Returns:
        The summary with any leading ``"<label>:"`` prefix removed, or
        ``None`` when there is no transcript text, the API request fails,
        or the reply is empty.

    Raises:
        HTTPException: 404 when no call document matches ``call_id`` at the
            time the summary is stored.
    """
    # Local import: the module top level only imports the client class.
    from openai import OpenAIError

    caption_text = get_caption_text(collection, call_id, user_id)
    if not caption_text:
        # Nothing to summarise; do not send "None" or "" to the model.
        return None

    # The API key is read from the local .env file.
    config = dotenv_values(".env")
    client = OpenAI(api_key=config["OPENAI_API_KEY"])

    prompt = (
        "The following is an extract from a call transcript. "
        f"Rewrite this as a structured, clear summary in {target_language}.\n\n"
        f'Call Transcript: """\n{caption_text}\n"""\n'
    )

    # gpt-3.5-turbo has a 4096 token limit and the request fails when it is
    # exceeded, so the request itself must sit inside the try block.
    try:
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="gpt-3.5-turbo",
        )
        result = chat_completion.choices[0].message.content
    except (OpenAIError, IndexError):
        return None

    if not result:
        return None

    # Persist the raw reply on the call document.
    update_result = collection.update_one({"call_id": call_id}, {"$set": {f"summaries.{user_id}": result}})

    # matched_count: storing a summary identical to the existing one matches
    # the document without modifying it and is not an error.
    if update_result.matched_count == 0:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Call not updated!")

    # Drop a leading "Summary:"-style label. Split on the first colon only,
    # so colons inside the summary body do not truncate it.
    _label, separator, remainder = result.partition(":")
    remainder = remainder.strip()
    return remainder if separator and remainder else result


def term_extraction(collection, call_id, user_id, target_language):
    """Extract key terms from a call transcript and store them per user.

    Args:
        collection: Collection holding call documents.
        call_id: Identifier of the call to analyse.
        user_id: User whose view of the transcript is analysed; also the key
            under which the terms are stored (``key_terms.<user_id>``).
        target_language: Passed through to ``extract_terms``.

    Returns:
        The value returned by ``extract_terms``, or an empty list when the
        transcript is missing or too short to analyse (in which case nothing
        is written to the database).

    Raises:
        HTTPException: 404 when no call document matches ``call_id`` at the
            time the terms are stored.
    """
    # Term extraction performs poorly on short transcripts.
    min_caption_length = 50

    combined_text = get_caption_text(collection, call_id, user_id)

    # get_caption_text returns None for calls without captions. Bail out
    # early rather than falling through to names that were never assigned.
    if combined_text is None or len(combined_text) <= min_caption_length:
        return []

    # Extract key terms from the concatenated caption text.
    key_terms = extract_terms(combined_text, target_language, len(combined_text))

    update_result = collection.update_one({"call_id": call_id}, {"$set": {f"key_terms.{user_id}": key_terms}})

    # matched_count: re-running on an unchanged transcript stores identical
    # terms, which matches the document without modifying it.
    if update_result.matched_count == 0:
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Call not updated!")

    return key_terms