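"""Helpers for a Streamlit summarisation app: fetch article text from a URL,
pull a YouTube transcript, restore punctuation with spaCy, and split long
text into model-sized chunks."""
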
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
import nltk
import re
import streamlit as st
from youtube_transcript_api import YouTubeTranscriptApi
import spacy

@st.cache
def fetch_article_text(url: str):
    """Download a web page and return the text of its <h1> and <p> elements."""
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    results = soup.find_all(["h1", "p"])
    text = [result.text for result in results]
    article = " ".join(text)
    # Strip citation markers such as [1].
    return re.sub(r'\[\d+\]', '', article)

def count_tokens(text: str):
    """Approximate token count as the number of whitespace-separated words."""
    return len(text.split())

@st.cache
def get_text_from_youtube_url(url: str):
    """Fetch a YouTube transcript and return it as lower-cased, punctuated text."""
    video_id = url.split("=")[1]
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception:
        # Fall back to the transcript list and fetch the English track.
        transcript = (
            YouTubeTranscriptApi.list_transcripts(video_id)
            .find_transcript(["en"])
            .fetch()
        )
    script = ""

    for text in transcript:
        t = text["text"]
        if t != '[Music]':  # skip auto-captioned music markers
            script += t.lower() + " "

    return add_punctuation(script)

def add_punctuation(text: str):
    """Re-punctuate and re-capitalise transcript text using spaCy POS tags."""
    try:
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        # Download the model on first use if it is not installed yet.
        import spacy.cli
        spacy.cli.download("en_core_web_sm")
        nlp = spacy.load("en_core_web_sm")

    doc = nlp(text)
    punctuation = [".", ",", ";", ":", "?", "!"]

    sentences = []
    for sentence in doc.sents:
        # Drop any punctuation token already sitting at the end of the span.
        last_token = sentence[-1]
        if last_token.text in punctuation:
            sentence = sentence[:-1]
        if len(sentence) == 0:
            continue

        # Heuristic: a sentence ending in a verb is treated as a question;
        # everything else gets a full stop.
        last_word = sentence[-1]
        if last_word.pos_ == "VERB":
            sentence = sentence.text + "?"
        else:
            sentence = sentence.text + "."

        # Capitalise the first character.
        sentence = sentence[0].upper() + sentence[1:]
        sentences.append(sentence)

    return " ".join(sentences)


def get_input_chunks(text: str, max_length: int = 500):
    """Split text on sentence boundaries into chunks of at most ~max_length tokens."""
    text = re.sub(r'\[\d+\]', '', text)  # strip citation markers such as [1]

    try:
        sentences = sent_tokenize(text)
    except LookupError:
        # The punkt tokenizer data is missing on first run; download it once.
        nltk.download('punkt')
        sentences = sent_tokenize(text)

    # Drop empty fragments and very short sentences.
    sentences = [sentence for sentence in sentences if len(sentence.strip()) > 0 and count_tokens(sentence) > 4]

    input_chunks = []
    temp_sentences = ""
    tokens = 0

    for sentence in sentences:
        if tokens + count_tokens(sentence) < max_length:
            # Keep a space between sentences while the chunk fills up.
            temp_sentences += (" " + sentence) if temp_sentences else sentence
            tokens += count_tokens(sentence)
        else:
            # The chunk is full; flush it and start a new one with this sentence.
            input_chunks.append(temp_sentences)
            tokens = count_tokens(sentence)
            temp_sentences = sentence

    if len(temp_sentences) > 0:
        input_chunks.append(temp_sentences)

    return input_chunks
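

# Minimal usage sketch, not part of the original module: the URL below is a
# placeholder, and any article page with <h1>/<p> content should work. Running
# this outside a Streamlit app may only trigger a caching warning.
if __name__ == "__main__":
    article = fetch_article_text("https://en.wikipedia.org/wiki/Automatic_summarization")
    chunks = get_input_chunks(article, max_length=500)
    print(f"{count_tokens(article)} tokens -> {len(chunks)} chunk(s)")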