|
from selenium import webdriver |
|
from selenium.webdriver.common.by import By |
|
from selenium.webdriver.support.ui import WebDriverWait |
|
from selenium.webdriver.support import expected_conditions as EC |
|
import os |
|
import time |
|
import fitz |
|
from base64 import b64decode |
|
from dateutil.relativedelta import relativedelta |
|
from datetime import date |
|
from datetime import datetime as dt |
|
from datetime import timedelta |
|
import streamlit as st |
|
import PyPDF2 |
|
from PyPDF2 import PdfMerger |
|
import glob |
|
|
|
|
|
|
|
# Page chrome for the Streamlit app.
# NOTE(review): the non-ASCII literals in this file (e.g. page_icon below) look
# mis-encoded; confirm the file is saved and read as UTF-8.
st.set_page_config(page_title="Dvar Creator", page_icon="馃搫", layout="wide", initial_sidebar_state="collapsed")
st.title("Dvar Creator")

# Headless Chrome configuration shared by every scraping function in this file.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-gpu')
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36")
options.add_experimental_option('prefs', {
    # The preference is handed to Chrome verbatim (no shell expands "~"), so
    # resolve it to an absolute path here, matching the absolute directory the
    # rest of the file reads from.
    "download.default_directory": os.path.expanduser("~/ccscraper/dvarmalchus"),
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    # Save PDFs to disk instead of opening them in Chrome's built-in viewer.
    "plugins.always_open_pdf_externally": True,
})

# NOTE(review): every function below starts (and quits) its own browser, so this
# module-level instance appears unused within this file and is never quit here.
# Kept so that the module's names are unchanged; confirm and remove if unused.
driver = webdriver.Chrome(options=options)
|
|
|
def download_wait(path_to_downloads, max_polls=20, poll_interval=1):
    """Wait until a directory holds no partially downloaded files.

    The directory is scanned for names ending in ``.crdownload`` (the suffix
    this file treats as "download still in progress"). Scanning stops as soon
    as a scan finds none, or after ``max_polls`` scans.

    Args:
        path_to_downloads: Directory to watch. A leading ``~`` is expanded.
        max_polls: Maximum number of scans before giving up.
        poll_interval: Seconds to sleep before each scan.

    Returns:
        The number of scans performed. With the default one-second interval
        this equals the number of seconds waited.

    Raises:
        FileNotFoundError: If the directory does not exist.
    """
    # The argument used to be ignored in favour of a hard-coded directory.
    downloads_dir = os.path.expanduser(path_to_downloads)
    polls = 0
    pending = True
    while pending and polls < max_polls:
        # Sleep first so a download that was only just triggered has time to
        # create its partial file before the first scan.
        time.sleep(poll_interval)
        pending = any(
            name.endswith('.crdownload') for name in os.listdir(downloads_dir)
        )
        polls += 1
    return polls
|
|
|
def dvarget(session):
    """Download the weekly booklet from dvarmalchus.org as ``dvar{session}.pdf``.

    Opens the site in headless Chrome, clicks the first candidate element whose
    text matches one of the two expected download labels, waits for the
    download to finish, and renames the downloaded PDF so the rest of the app
    can find it by session.

    Args:
        session: Session identifier embedded in the target file name. The
            caller in this file passes a ``datetime``.

    Raises:
        RuntimeError: If no download link was found or no new window opened.
        Exception: Selenium/OS errors propagate; the caller in this file treats
            any failure as "fall back to Chabad.org".
    """
    download_dir = "/home/mendy/ccscraper/dvarmalchus"
    regular_label = "诇讛讜专讚转 讛讞讜讘专转 讛砖讘讜注讬转"
    alternate_label = "诇讛讜专讚转 讛讞讜讘专转 讛砖讘讜注讬转 - 讞讜状诇"
    # Candidate locations of the download link, tried in order.
    candidate_xpaths = [
        "/html/body/div[1]/section[2]/div[3]/div/div/div[4]/div/div/section/section/div/div/div/div/div/div",
        "/html/body/div[1]/section[2]/div[3]/div/div/div[4]/div/div/section/section/div/div/div/div[2]/div/div/a/span/span[2]",
        '/html/body/div[1]/section[9]/div/div/div/div[3]/div/div/div/div[1]/div/section/div/div/div/section/div/div/div/div/div/div/a/span/span[2]',
        '/html/body/div[1]/section[2]/div[3]/div/div/div[4]/div/div/section/section/div/div/div/div[2]/div/div/a',
    ]

    driver = webdriver.Chrome(options=options)
    try:
        driver.get("https://dvarmalchus.org")

        clicked = False
        for each in candidate_xpaths:
            # find_elements returns an empty list for a missing node, so one
            # absent candidate no longer aborts the remaining fallbacks.
            matches = driver.find_elements(By.XPATH, each)
            label = matches[0].text if matches else None
            if label == regular_label:
                print("clicking regular" + each)
            elif label == alternate_label:
                print("clicking alternate" + each)
            else:
                print("skipping " + each)
                continue
            matches[0].click()
            clicked = True
            break

        if not clicked:
            raise RuntimeError("no weekly-booklet download link found on dvarmalchus.org")
        if len(driver.window_handles) < 2:
            raise RuntimeError("clicking the download link did not open a new window")
        driver.switch_to.window(driver.window_handles[1])

        download_wait(download_dir)

        # Files generated by this app embed the session timestamp, so they
        # contain the session's year; presumably the freshly downloaded booklet
        # does not, which is how it is told apart. The year used to be the
        # literal "2023", which stopped matching session files in later years.
        sessionyear = str(getattr(session, "year", dt.now().year))
        target = os.path.join(download_dir, f"dvar{session}.pdf")
        for file in os.listdir(download_dir):
            if file.endswith(".pdf") and sessionyear not in file:
                print("renaming " + file)
                os.rename(os.path.join(download_dir, file), target)
    finally:
        # Always release the browser, including when a step above raised.
        driver.quit()
|
|
|
def _chabad_page_pdf_bytes(url, pdf_options, settle_seconds=0):
    """Load ``url`` in a fresh headless Chrome and return the page printed as PDF bytes."""
    browser = webdriver.Chrome(options=options)
    try:
        browser.get(url)
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.ID, "content"))
        )
        if settle_seconds:
            # Extra pause after the content node appears, before printing.
            time.sleep(settle_seconds)
        printed = browser.execute_cdp_cmd("Page.printToPDF", pdf_options)
        return b64decode(printed["data"])
    finally:
        browser.quit()


def _chabad_merge_pages(urls, output_path, pdf_options, settle_seconds=0):
    """Print every URL to PDF and merge the printouts, in order, into ``output_path``."""
    from io import BytesIO  # local import keeps this block self-contained

    merger = PdfMerger()
    try:
        for url in urls:
            # Each printout is handed to the merger as its own in-memory
            # document. The previous code appended all printouts to a single
            # temp file, which is not a valid way to combine PDFs.
            merger.append(BytesIO(_chabad_page_pdf_bytes(url, pdf_options, settle_seconds)))
        merger.write(output_path)
    finally:
        merger.close()


def chabadget(dor, opt, session):
    """Create the Chumash and/or Tanya PDFs for a session from Chabad.org.

    For each requested material whose ``<Material>{session}.pdf`` does not yet
    exist, prints the daily-study page of every date in ``dor`` and merges the
    printouts into that file.

    Args:
        dor: Date strings already URL-encoded as ``MM%2FDD%2FYYYY``.
        opt: Selected material names; only ``'Chumash'`` and ``'Tanya'`` are
            handled here.
        session: Session identifier embedded in the output file names.

    Raises:
        Exception: Selenium, PyPDF2 and OS errors propagate to the caller.
    """
    # NOTE(review): the Chrome DevTools Page.printToPDF reference lists margins
    # as numeric marginTop/marginRight/marginBottom/marginLeft; these hyphenated
    # string keys may be ignored. Left unchanged so page layout stays as it is.
    pdf_options = {
        'scale': 0.8,
        'margin-top': '0.1in',
        'margin-right': '0.1in',
        'margin-bottom': '0.1in',
        'margin-left': '0.1in',
    }
    # Write to the same absolute directory that the existence checks (and the
    # PDF assembly step) look in, instead of the current working directory.
    output_dir = "/home/mendy/ccscraper/dvarmalchus"

    chumash_path = os.path.join(output_dir, f"Chumash{session}.pdf")
    if 'Chumash' in opt and not os.path.exists(chumash_path):
        chumash_urls = [
            f"https://www.chabad.org/dailystudy/torahreading.asp?tdate={i}#lt=he"
            for i in dor
        ]
        _chabad_merge_pages(chumash_urls, chumash_path, pdf_options)

    tanya_path = os.path.join(output_dir, f"Tanya{session}.pdf")
    if 'Tanya' in opt and not os.path.exists(tanya_path):
        tanya_urls = [
            f"https://www.chabad.org/dailystudy/tanya.asp?date={i}&commentary=false#lt=he"
            for i in dor
        ]
        # The Tanya page keeps its original three-second pause before printing.
        _chabad_merge_pages(tanya_urls, tanya_path, pdf_options, settle_seconds=3)
|
|
|
def rambamenglish(dor, session):
    """Create ``Rambam{session}.pdf`` from Chabad.org's three-chapter Rambam pages.

    Does nothing when the file already exists. Otherwise prints the page for
    every date in ``dor`` (requested with ``#lt=both``) and merges the
    printouts, in order, into one PDF.

    Args:
        dor: Date strings already URL-encoded as ``MM%2FDD%2FYYYY``.
        session: Session identifier embedded in the output file name.

    Raises:
        Exception: Selenium, PyPDF2 and OS errors propagate to the caller.
    """
    from io import BytesIO  # local import keeps this block self-contained

    # NOTE(review): the Chrome DevTools Page.printToPDF reference lists margins
    # as numeric marginTop/marginRight/marginBottom/marginLeft; these hyphenated
    # string keys may be ignored. Left unchanged so page layout stays as it is.
    pdf_options = {
        'scale': 0.48,
        'margin-top': '0.1in',
        'margin-right': '0.1in',
        'margin-bottom': '0.1in',
        'margin-left': '0.1in',
    }
    # Same absolute location the existence check and the assembly step use.
    rambam_path = os.path.join("/home/mendy/ccscraper/dvarmalchus", f"Rambam{session}.pdf")
    if os.path.exists(rambam_path):
        return

    merger = PdfMerger()
    try:
        for i in dor:
            browser = webdriver.Chrome(options=options)
            try:
                browser.get(f"https://www.chabad.org/dailystudy/rambam.asp?rambamchapters=3&tdate={i}#lt=both")
                WebDriverWait(browser, 10).until(
                    EC.presence_of_element_located((By.ID, "content"))
                )
                printed = browser.execute_cdp_cmd("Page.printToPDF", pdf_options)
            finally:
                browser.quit()
            # One in-memory document per day; the previous code appended every
            # printout to a single temp file, which is not a valid PDF merge.
            merger.append(BytesIO(b64decode(printed["data"])))
        merger.write(rambam_path)
    finally:
        merger.close()
|
|
|
def daytoheb(week, dow):
    """Translate English day names into the Hebrew labels used by this app.

    Args:
        week: Iterable of English day names (``'Sunday'`` ... ``'Shabbos'``).
        dow: List that receives one Hebrew label per recognised day, in the
            order the days appear in ``week``. Unrecognised names are skipped.

    Returns:
        The same ``dow`` list, after the labels were appended.
    """
    hebrew_labels = {
        'Sunday': '讬讜诐 专讗砖讜谉',
        'Monday': '讬讜诐 砖谞讬',
        'Tuesday': '讬讜诐 砖诇讬砖讬',
        'Wednesday': '讬讜诐 专讘讬注讬',
        'Thursday': '讬讜诐 讞诪讬砖讬',
        'Friday': '讬讜诐 砖讬砖讬',
        'Shabbos': '砖讘转 拽讜讚砖',
    }
    dow.extend(hebrew_labels[day] for day in week if day in hebrew_labels)
    return dow
|
|
|
def opttouse(opt, optconv):
    """Map selected material names to the section titles the app searches for.

    Args:
        opt: Iterable of material names as offered in the form.
        optconv: List that receives one converted label per recognised
            material, in input order. ``'Rambam-Bilingual'`` is passed through
            unchanged; unrecognised names are skipped.

    Returns:
        The same ``optconv`` list, after the labels were appended.
    """
    section_titles = {
        'Chumash': '讞讜诪砖 讬讜诪讬',
        'Tanya': '转谞讬讗 讬讜诪讬',
        'Rambam-Hebrew': '专诪讘"诐 - 砖诇讜砖讛 驻专拽讬诐 诇讬讜诐',
        'Haftorah': '讞讜诪砖 诇拽专讬讗讛 讘爪讬讘讜专',
        'Rambam-Bilingual': 'Rambam-Bilingual',
    }
    optconv.extend(section_titles[name] for name in opt if name in section_titles)
    return optconv
|
|
|
def daytorambam(week, dor):
    """Turn day names into URL-encoded dates for the Chabad.org study pages.

    Each day name is resolved with ``today + relativedelta(weekday=n)`` and
    formatted as ``MM%2FDD%2FYYYY`` (``%2F`` being an encoded ``/``).

    Args:
        week: Iterable of English day names (``'Sunday'`` ... ``'Shabbos'``).
        dor: List that receives one encoded date per day, in input order.

    Returns:
        The same ``dor`` list, after the dates were appended.

    Raises:
        KeyError: If ``week`` contains a name that is not a known day.
    """
    # Monday-based indices, with 'Shabbos' standing in for Saturday.
    weekday_index = {
        'Monday': 0,
        'Tuesday': 1,
        'Wednesday': 2,
        'Thursday': 3,
        'Friday': 4,
        'Shabbos': 5,
        'Sunday': 6,
    }
    today = date.today()
    for day_name in week:
        n = weekday_index[day_name]
        print(n)
        target = today + relativedelta(weekday=n)
        dor.append(f'{target.month:02d}%2F{target.day:02d}%2F{target.year:04d}')
    return dor
|
|
|
def _dynamicmake_append_file(doc_out, path):
    """Append every page of the PDF at ``path`` to ``doc_out`` and close the source."""
    src = fitz.open(path)
    try:
        doc_out.insert_pdf(src)
    finally:
        src.close()


def dynamicmake(dow, optconv, opt, source, session):
    """Assemble the selected material into ``output_dynamic{session}.pdf``.

    When ``source`` is falsy the output is built from the per-session PDFs
    (``Chumash``, ``Tanya``, ``Rambam``) in the order given by ``opt``.
    Otherwise the pages are cut out of ``dvar{session}.pdf`` using its table
    of contents: for each title in ``optconv`` and each day label in ``dow``,
    the matching sub-entry's page range is copied.

    Args:
        dow: Hebrew day labels to look for in table-of-contents sub-entries.
        optconv: Section titles (or ``'Rambam-Bilingual'``) to extract.
        opt: Material names as selected in the form.
        source: Truthy to cut pages from the booklet, falsy to use the
            per-session PDFs.
        session: Session identifier embedded in all file names.

    Raises:
        Exception: Errors opening or reading any input PDF propagate. An
            unreadable booklet used to be swallowed and resurfaced later as a
            ``NameError``.
    """
    output_dir = "/home/mendy/ccscraper/dvarmalchus"
    booklet_path = os.path.join(output_dir, f"dvar{session}.pdf")
    rambam_path = os.path.join(output_dir, f"Rambam{session}.pdf")
    haftorah_title = '讞讜诪砖 诇拽专讬讗讛 讘爪讬讘讜专'
    # Section title -> (pages to back off from the next TOC entry, log message).
    section_rules = {
        "讞讜诪砖 讬讜诪讬": (3, "Chumash found"),
        "转谞讬讗 讬讜诪讬": (2, "Tanya found"),
        '专诪讘"诐 - 砖诇讜砖讛 驻专拽讬诐 诇讬讜诐': (1, "Rambam found"),
    }

    doc_out = fitz.open()
    try:
        if not source:
            print("Chabad.org")
            print(opt)
            session_files = {
                'Chumash': os.path.join(output_dir, f"Chumash{session}.pdf"),
                'Tanya': os.path.join(output_dir, f"Tanya{session}.pdf"),
                # This branch used to call ``doc.out.insert_pdf`` (a typo).
                'Rambam-Bilingual': rambam_path,
            }
            for option in opt:
                if option in session_files:
                    _dynamicmake_append_file(doc_out, session_files[option])
        else:
            doc = fitz.open(booklet_path)
            try:
                toc = doc.get_toc()
                last_page = len(doc) - 1

                for q in optconv:
                    if q in section_rules:
                        offset, found_message = section_rules[q]
                        for z in dow:
                            for i, top_level in enumerate(toc):
                                if not top_level[2] or top_level[1] != q:
                                    continue
                                for j, sub_level in enumerate(toc[i+1:], start=i+1):
                                    # Stop at the first entry that is not a
                                    # direct child of the section.
                                    if sub_level[0] != top_level[0] + 1:
                                        break
                                    if z in sub_level[1]:
                                        start_page = sub_level[2] - 1
                                        # The range ends a fixed number of
                                        # pages before the next TOC entry; with
                                        # no next entry, run to the last page.
                                        if j + 1 < len(toc):
                                            end_page = toc[j+1][2] - offset
                                        else:
                                            end_page = last_page
                                        print(found_message)
                                        doc_out.insert_pdf(doc, from_page=start_page, to_page=end_page)
                    elif q == haftorah_title:
                        # NOTE(review): the original nesting of this branch was
                        # ambiguous; it is run once per material here, not once
                        # per selected day.
                        with open(booklet_path, "rb") as pdf_file:
                            pdf_reader = PyPDF2.PdfReader(pdf_file)
                            for i, item in enumerate(toc):
                                if item[1] != haftorah_title:
                                    continue
                                page_num_start = item[2] - 1
                                if i + 1 < len(toc):
                                    page_num_end = toc[i+1][2] - 3
                                else:
                                    page_num_end = last_page
                                print("Torah reading found")
                                for page_num in range(page_num_start, page_num_end):
                                    text = pdf_reader.pages[page_num].extract_text() or ""
                                    if "讘专讻转 讛驻讟讜专讛" in text or "xtd enk dxhtdd renyl" in text:
                                        doc_out.insert_pdf(doc, from_page=page_num, to_page=page_num_end)
                                        # The tail is copied from the first
                                        # matching page; stop so later matches
                                        # do not insert overlapping copies.
                                        break
                    elif q == 'Rambam-Bilingual':
                        _dynamicmake_append_file(doc_out, rambam_path)
                        print("Appended")
            finally:
                doc.close()

        if len(doc_out) == 0:
            # Nothing matched; skip saving. The caller only offers a download
            # when the output file exists.
            print("No pages selected; output not written")
        else:
            doc_out.save(os.path.join(output_dir, f"output_dynamic{session}.pdf"))
    finally:
        doc_out.close()
|
|
|
|
|
# Input form: which days, which materials, and which source to build from.
_day_choices = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Shabbos']
_material_choices = ['Chumash', 'Tanya', 'Rambam-Hebrew', 'Rambam-Bilingual', 'Haftorah']

with st.form(key="dvarform", clear_on_submit=False):
    st.title("Printout Creator :book:")
    week = st.multiselect(
        'Select the days of the week.',
        options=_day_choices,
    )
    opt = st.multiselect(
        'Select what materials you want.',
        options=_material_choices,
    )
    # Checked (the default) means the Dvar Malchus booklet is the source.
    source = st.checkbox(
        'Use Dvar Malchus, or get from Chabad.org? If checked, sources from Dvar Malchus are used.',
        value=True,
    )
    submit_button = st.form_submit_button(label="Generate PDF")
|
|
|
if submit_button:
    # One identifier per browser session, used to name this session's files.
    # The check used to test the builtin ``id`` instead of the key 'id', so a
    # new identifier was generated on every submit.
    if 'id' not in st.session_state:
        st.session_state['id'] = dt.now()
    session = st.session_state.id

    output_dir = "/home/mendy/ccscraper/dvarmalchus"
    weekorder = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Shabbos']
    optorder = ['Chumash', 'Tanya', 'Rambam-Hebrew', 'Rambam-Bilingual', 'Haftorah']
    dow = []
    optconv = []
    dor = []
    # Normalise the selections to calendar order / fixed material order.
    week = sorted(week, key=weekorder.index)
    opt = sorted(opt, key=optorder.index)

    daytoheb(week, dow)
    opttouse(opt, optconv)
    print(optconv)

    booklet_path = os.path.join(output_dir, f"dvar{session}.pdf")
    if source:
        if not os.path.exists(booklet_path):
            try:
                dvarget(session)
            except Exception as exc:
                # Best effort: any download failure falls back to Chabad.org.
                print(f"Dvar Malchus download failed, using Chabad.org: {exc}")
                source = False
        if source and not os.path.exists(booklet_path):
            print("Dvar Malchus booklet missing after download, using Chabad.org")
            source = False

    # With a stable session id, PDFs generated for an earlier selection would
    # be reused as-is. Drop them so they are rebuilt for the days chosen now;
    # the downloaded booklet does not depend on the selection and is kept.
    for prefix in ("Chumash", "Tanya", "Rambam"):
        try:
            os.remove(os.path.join(output_dir, f"{prefix}{session}.pdf"))
        except FileNotFoundError:
            pass

    # Compute the dates once; calling daytorambam twice on the same list
    # duplicated every date when both branches below ran.
    if (not source) or 'Rambam-Bilingual' in opt:
        daytorambam(week, dor)

    if not source:
        chabadget(dor, opt, session)

    if 'Rambam-Bilingual' in opt:
        rambamenglish(dor, session)

    dynamicmake(dow, optconv, opt, source, session)

    output_path = os.path.join(output_dir, f"output_dynamic{session}.pdf")
    if os.path.exists(output_path):
        with open(output_path, "rb") as f:
            st.download_button(label="Download", data=f, file_name="output_dynamic.pdf", mime="application/pdf")

    # Housekeeping: delete other sessions' files once their embedded timestamp
    # is more than a minute old. Names that do not carry a parsable timestamp
    # are left alone instead of crashing the page.
    now = dt.now()
    for prefix in ("Rambam", "Chumash", "Tanya", "dvar", "output_dynamic"):
        for path in glob.glob(os.path.join(output_dir, f"{prefix}*.pdf")):
            name = os.path.basename(path)
            if name == f"{prefix}{session}.pdf":
                continue
            # Slice off the exact prefix and extension; lstrip/rstrip strip
            # character sets, not prefixes or suffixes.
            timestamp = name[len(prefix):-len(".pdf")]
            file_datetime = None
            # str(datetime) omits the fractional part when microseconds are 0.
            for fmt in ("%Y-%m-%d %H:%M:%S.%f", "%Y-%m-%d %H:%M:%S"):
                try:
                    file_datetime = dt.strptime(timestamp, fmt)
                    break
                except ValueError:
                    continue
            if file_datetime is None:
                continue
            if now - file_datetime > timedelta(minutes=1):
                os.remove(path)
|
|
|
|
|
|