"""Streamlit front end for keyword search over Whoosh-indexed transcriptions.

The app opens every ``sub*`` Whoosh index found under ``Data/Transcription_index``,
lets the user enter keywords and a date range, lists the matching videos, and
shows the raw transcription CSV of the selected video.
"""

import os
import pathlib
import re
from datetime import date as Date

import pandas as pd
import streamlit as st
import whoosh
import whoosh.index
import whoosh.query

DATA_FOLDER = pathlib.Path(__file__).parent / "Data"
RAW_FOLDER = DATA_FOLDER / "Transcription_raw"
INDEX_FOLDER = DATA_FOLDER / "Transcription_index"


class Searcher:
    """Search the transcription indexes and load per-video transcription files."""

    def __init__(self):
        # Combined view over all sub-indexes found on disk.
        self.ix = self.make_total_ix()
        # Table read from video_links.csv; `search` reads its "title" and
        # "date" columns.
        self.df_video_links = self.get_video_links()

    def make_total_ix(self):
        """Open every directory in INDEX_FOLDER whose name starts with "sub".

        Returns:
            A MultiIndexSearcher wrapping all opened indexes.
        """
        # Sorted so the sub-indexes are always opened in the same order,
        # independent of the file system's listing order.
        ixes_sub = [
            whoosh.index.open_dir(INDEX_FOLDER / name)
            for name in sorted(os.listdir(INDEX_FOLDER))
            if name.startswith("sub")
        ]
        return MultiIndexSearcher(ixes_sub)

    def search(self, date_start, date_end, **kwargs):
        """Run a query and keep only hits whose date lies in the given range.

        Args:
            date_start: First date (inclusive) to keep.
            date_end: Last date (inclusive) to keep.
            **kwargs: Forwarded unchanged to ``MultiIndexSearcher.search``
                (the caller in this file passes ``q=<whoosh query>``).

        Returns:
            A list of ``(index, date, title)`` tuples sorted by date, where
            ``date`` is the original string from the video table.
        """
        contents = []
        for hit_title in self.ix.search(**kwargs):
            # The text before the first "m" of the stored title is parsed as
            # the row position in the video table.
            # NOTE(review): presumably titles look like "<n>m..."; confirm
            # against the indexing script.
            index = int(hit_title.split("m")[0])
            row = self.df_video_links.iloc[index]
            date = row["date"]
            # Dates are "/"-separated numbers passed to date(year, month, day).
            date_datetime = Date(*map(int, date.split("/")))
            if not (date_start <= date_datetime <= date_end):
                continue
            contents.append((date_datetime, index, date, row["title"]))

        # Order chronologically; the parsed date is only a sort key and is
        # dropped from the returned tuples.
        contents.sort()
        return [(index, date, title) for _, index, date, title in contents]

    def get_video_links(self):
        """Read ``video_links.csv`` from DATA_FOLDER, first column as index."""
        return pd.read_csv(DATA_FOLDER / "video_links.csv", index_col=0)

    def get_content(self, index):
        """Load the transcription CSV for one video.

        Files in RAW_FOLDER are named ``<index>-<number>.csv``. When several
        files exist for the same index, the one with the numerically largest
        ``<number>`` is used.

        Args:
            index: Video index as it appears in the file name.

        Returns:
            The transcription as a DataFrame.

        Raises:
            FileNotFoundError: If no file for ``index`` exists in RAW_FOLDER.
        """
        # fullmatch plus an escaped "." so that only exact
        # "<index>-<digits>.csv" names qualify (not e.g. "5-3.csv.bak").
        pattern = re.compile(r"{}-(\d+)\.csv".format(re.escape(str(index))))
        candidates = []
        for name in os.listdir(RAW_FOLDER):
            found = pattern.fullmatch(name)
            if found:
                # Compare the numeric suffix as an integer: a plain string
                # sort would rank "5-10.csv" below "5-9.csv".
                candidates.append((int(found.group(1)), name))

        if not candidates:
            raise FileNotFoundError(
                "no transcription file for index {} in {}".format(index, RAW_FOLDER)
            )

        _, best = max(candidates)
        return pd.read_csv(RAW_FOLDER / best)


class MultiIndexSearcher:
    """Run the same query against several Whoosh indexes and merge the hits."""

    def __init__(self, ixes):
        # Opened Whoosh index objects to query in turn.
        self.ixes = ixes

    def search(self, **kwargs):
        """Search every index and collect the stored "title" of each hit.

        Args:
            **kwargs: Forwarded to each index searcher's ``search``;
                ``limit=None`` is always added, so callers must not pass
                ``limit`` themselves.

        Returns:
            A list with the "title" field of every hit, index by index.
        """
        titles = []
        for ix in self.ixes:
            # Hits are read inside the `with` block, while the searcher is
            # still open.
            with ix.searcher() as sub_searcher:
                hits = sub_searcher.search(**kwargs, limit=None)
                titles.extend(hit["title"] for hit in hits)
        return titles


# NOTE(review): Streamlit re-executes the script on each interaction, so the
# indexes and the CSV are likely reopened every time; consider wrapping this
# in st.cache_resource if start-up cost becomes noticeable.
searcher = Searcher()


def main():
    """Render the search page: inputs, result table and selected transcription."""
    st.title("KATO DB")
    keyword = st.text_input(
        "検索したいキーワードを入力して、Enterを押してください\n"
        "空欄だと全文書表示します。"
    )
    date_start = st.date_input(
        "検索したい開始日付を入力してください",
        Date(2009, 1, 1),
    )
    date_end = st.date_input(
        "検索したい終了日付を入力してください",
        Date(2050, 12, 31),
    )

    # Build the query. Splitting first means whitespace-only input is treated
    # like blank input (show everything) rather than becoming an empty AND.
    keyword_list = keyword.split()
    if keyword_list:
        # AND search: every word must occur in the "content" field.
        query = whoosh.query.And(
            [whoosh.query.Term("content", word) for word in keyword_list]
        )
    else:
        query = whoosh.query.Every()

    contents = searcher.search(q=query, date_start=date_start, date_end=date_end)
    st.write("該当件数:{}件".format(len(contents)))
    results = pd.DataFrame(contents, columns=["管理番号", "放送日", "動画タイトル"])
    st.dataframe(results, hide_index=True)

    selected_index = st.selectbox("管理番号を選択して書き起こしを表示", results["管理番号"])
    if selected_index is not None:
        try:
            df_transcription = searcher.get_content(selected_index)
        except FileNotFoundError:
            # Keep the page usable when a transcription file is missing.
            st.warning(
                "管理番号 {} の書き起こしファイルが見つかりませんでした。".format(selected_index)
            )
        else:
            st.dataframe(df_transcription, width=1000)


if __name__ == "__main__":
    main()