import re
import pandas as pd


def is_valid_sentence(text):
    """Check whether text starts with a capital letter and ends with '.', '?', or '!'."""
    return bool(re.match(r"[A-Z]", text)) and bool(re.search(r"[.?!]\s*$", text))

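# A quick sanity check, as a sketch (the example strings are hypothetical,
# not taken from the source transcripts):
#
#   is_valid_sentence("Where do we come from?")   # -> True: capital + '?'
#   is_valid_sentence("and so it continued")      # -> False: no capital, no terminator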

def group_by_sentences(df):
    """Group episode transcript passages by sentences.
    """
    episode_id = None
    guest = None
    title = None
    text = ""
    start = None
    end = None
    new_rows = []

    for _, row in df.iterrows():
        # continue previous sentence that wasn't complete
        if episode_id == row["id"]:
            # append the current text to the previous text and merge timestamps
            text += " " + row["text"]
            end = row["end"]
        else:
            # otherwise, create a new row and reset variables
            episode_id = row["id"]
            guest = row["guest"]
            title = row["title"]
            text = row["text"]
            start = row["start"]
            end = row["end"]

        if is_valid_sentence(text):
            # add new sentence if valid and reset id
            new_rows.append([episode_id, guest, title, text, start, end])
            episode_id = None

    # flush a trailing passage that never formed a complete sentence
    if episode_id is not None:
        new_rows.append([episode_id, guest, title, text, start, end])

    # create a new dataframe with the new rows
    new_df = pd.DataFrame(new_rows, columns=df.columns)

    return new_df

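# Example usage, a minimal sketch: the column names ('id', 'guest', 'title',
# 'text', 'start', 'end') follow the code above; the passage values themselves
# are made up for illustration.
#
#   passages = pd.DataFrame({
#       'id': ["ep1", "ep1"],
#       'guest': ["Ada", "Ada"],
#       'title': ["On computing", "On computing"],
#       'text': ["Machines can", "surprise us."],
#       'start': [0.0, 2.5],
#       'end': [2.5, 4.0],
#   })
#   group_by_sentences(passages)
#   # -> one row: text "Machines can surprise us.", start 0.0, end 4.0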

def group_by_chunks(df, chunk_size=10):
    """Group episode transcript passages by chunks, where each chunk is a
    series of `chunk_size` contiguous passages from the same episode.
    """
    new_rows = []

    # group the dataframe by episode so chunks never span two episodes
    for _, id_df in df.groupby('id'):
        # walk the episode's passages in steps of chunk_size
        for i in range(0, len(id_df), chunk_size):
            chunk = id_df.iloc[i:i + chunk_size]

            # concatenate the text and span the timestamps of the whole chunk
            new_rows.append({
                'id': chunk['id'].iat[0],
                'guest': chunk['guest'].iat[0],
                'title': chunk['title'].iat[0],
                'text': " ".join(chunk['text']),
                'start': chunk['start'].iat[0],
                'end': chunk['end'].iat[-1],
            })

    # build the result in a single pass instead of concatenating inside the loop
    return pd.DataFrame(new_rows, columns=df.columns)
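

# Minimal demo, assuming the same column layout as above; the episode data is
# synthetic. Run this file directly to see both grouping strategies side by side.
if __name__ == "__main__":
    demo = pd.DataFrame({
        'id': ["ep1"] * 3,
        'guest': ["Ada"] * 3,
        'title': ["On computing"] * 3,
        'text': ["We asked whether", "machines can think.", "They might."],
        'start': [0.0, 1.5, 3.0],
        'end': [1.5, 3.0, 4.0],
    })
    print(group_by_sentences(demo))
    print(group_by_chunks(demo, chunk_size=2))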