# NOTE: scraped page-header residue ("Spaces: Build error"), kept as a comment so it is not parsed as code.
| import streamlit as st | |
| import PyPDF2 | |
| import openai | |
| from io import BytesIO | |
| import io | |
| from reportlab.pdfgen import canvas | |
| from reportlab.lib.pagesizes import letter, A4 | |
| from reportlab.pdfbase import pdfmetrics | |
| from reportlab.pdfbase.ttfonts import TTFont | |
| from weasyprint import HTML, CSS | |
| from weasyprint.text.fonts import FontConfiguration | |
| import arabic_reshaper | |
| from bidi.algorithm import get_display | |
| import os | |
| import tempfile | |
# Default OpenAI key: read from the environment, where Hugging Face secrets
# are exposed. Stays None when the variable is not set.
api_key = os.getenv('OPENAI_API_KEY')
def register_fonts():
    """Register the TrueType fonts used by the ReportLab renderer.

    Every font is registered in its own ``try`` so that one missing or
    unreadable file does not prevent the remaining fonts from being
    registered. Failures are reported through ``st.warning`` and otherwise
    ignored (best effort); nothing is returned.
    """
    # (ReportLab font name, TTF path relative to the working directory).
    # NOTE(review): the Urdu face is registered as 'NotoNastaliqUrdu' but is
    # loaded from 'NafeesNastaleeqXX.ttf' - confirm which file is shipped.
    fonts = (
        ('NotoNastaliqUrdu', 'NafeesNastaleeqXX.ttf'),
        ('NotoNaskhArabic', 'NotoNaskhArabic-Regular.ttf'),
        ('NotoSans', 'NotoSans-Regular.ttf'),
    )
    for font_name, font_file in fonts:
        try:
            pdfmetrics.registerFont(TTFont(font_name, font_file))
        except Exception as e:
            # Deliberately broad: a font problem must never stop the app.
            st.warning(
                f"Font '{font_name}' ({font_file}) could not be loaded. "
                f"Default fonts will be used. Error: {str(e)}"
            )
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of an uploaded PDF.

    Args:
        pdf_file: The uploaded PDF; it is handed unchanged to
            ``PyPDF2.PdfReader``.

    Returns:
        str: The text of all pages, one page after another, separated by a
        newline so the last word of a page is not fused with the first word
        of the next one. Pages that yield no text contribute an empty string.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # `or ""` guards against a page whose extraction yields nothing.
    page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    return "\n".join(page_texts)
def _urdu_text_to_html(text):
    """Turn plain text into HTML-escaped ``<p class="urdu-text">`` paragraphs.

    Each input line becomes one paragraph. Runs of characters outside the
    U+0600-U+06FF block are wrapped in ``<span class="latin">``; whitespace at
    the edges of such a run is left outside the span.
    """
    from html import escape
    from itertools import groupby

    def is_urdu(char):
        # The Urdu full stop and comma (U+06D4, U+060C) are inside this range.
        return '\u0600' <= char <= '\u06FF'

    paragraphs = []
    for line in text.split('\n'):
        parts = []
        for urdu, group in groupby(line, key=is_urdu):
            chunk = ''.join(group)
            core = chunk.strip()
            if urdu or not core:
                # Urdu text, or whitespace only: no span needed.
                parts.append(escape(chunk))
                continue
            lead = chunk[:len(chunk) - len(chunk.lstrip())]
            trail = chunk[len(chunk.rstrip()):]
            # Escaping stops '<', '>' and '&' in the translation from being
            # interpreted as markup.
            parts.append(
                f'{escape(lead)}<span class="latin">{escape(core)}</span>{escape(trail)}'
            )
        body = ''.join(parts)
        paragraphs.append(f'<p class="urdu-text">{body}</p>')
    return '\n'.join(paragraphs)


def _render_urdu_pdf(text):
    """Render Urdu text to a PDF with WeasyPrint and return it as ``BytesIO``."""
    font_config = FontConfiguration()
    processed_text = _urdu_text_to_html(text)
    html_content = f"""
<!DOCTYPE html>
<html dir="rtl" lang="ur">
<head>
<meta charset="UTF-8">
<style>
@font-face {{
    font-family: 'NotoNastaliqUrdu';
    src: url('fonts/NotoNastaliqUrdu-Regular.ttf') format('truetype');
    font-weight: normal;
    font-style: normal;
}}
@page {{
    size: A4;
    margin: 3cm 2.5cm;
}}
body {{
    font-family: 'NotoNastaliqUrdu', serif;
    font-size: 16pt;
    line-height: 3;
    margin: 0;
    padding: 0;
    direction: rtl;
    text-align: right;
    text-rendering: optimizeLegibility;
    -webkit-font-smoothing: antialiased;
}}
.content {{
    width: 100%;
    max-width: 18cm;
    margin: 0 auto;
}}
.urdu-text {{
    margin: 0 0 2em 0;
    padding: 0;
    text-align: right;
    white-space: pre-wrap;
    word-wrap: break-word;
    font-feature-settings: "kern", "liga", "calt";
    letter-spacing: 0.02em;
}}
.latin {{
    font-family: Arial, sans-serif;
    direction: ltr;
    unicode-bidi: embed;
    font-size: 14pt;
}}
/* Improve spacing around punctuation */
.urdu-text::after {{
    content: "";
    display: block;
    height: 1.5em;
}}
</style>
</head>
<body>
<div class="content">
{processed_text}
</div>
</body>
</html>
"""
    buffer = BytesIO()
    # Rendering from a string avoids a temporary file that would leak if
    # write_pdf raised. base_url makes the relative 'fonts/...' URL resolve
    # against the working directory instead of the temp directory.
    # NOTE(review): assumes the fonts/ folder sits in the working directory,
    # like the font files loaded by register_fonts - confirm.
    HTML(string=html_content, base_url=os.getcwd()).write_pdf(
        buffer,
        font_config=font_config,
        stylesheets=[CSS(string='''
@page {
    size: A4;
    margin: 3cm 2.5cm;
    @top-right {
        content: "";
        margin: 1cm 0;
    }
    @bottom-center {
        content: counter(page);
        font-family: Arial, sans-serif;
    }
}
''')]
    )
    buffer.seek(0)
    return buffer


def _select_font(c, preferred, size):
    """Set *preferred* on canvas *c*, falling back to Helvetica; return the name used."""
    try:
        c.setFont(preferred, size)
        return preferred
    except Exception as e:
        # Typically the font was never registered (missing TTF file).
        st.warning(f"Font '{preferred}' is unavailable, using Helvetica. Error: {str(e)}")
        c.setFont('Helvetica', size)
        return 'Helvetica'


def _wrap_to_width(c, line, font_name, font_size, max_width):
    """Greedily split *line* on spaces so that each piece fits in *max_width* points.

    An empty line yields ``[""]`` so blank lines are preserved. A single word
    wider than *max_width* is kept whole on its own line.
    """
    if c.stringWidth(line, font_name, font_size) <= max_width:
        return [line]
    pieces = []
    current = ""
    for word in line.split(' '):
        candidate = f"{current} {word}" if current else word
        if current and c.stringWidth(candidate, font_name, font_size) > max_width:
            pieces.append(current)
            current = word
        else:
            current = candidate
    pieces.append(current)
    return pieces


def _render_canvas_pdf(text, target_language):
    """Render non-Urdu text to a PDF with ReportLab and return it as ``BytesIO``."""
    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=A4)
    width, height = A4
    margin = 50
    max_width = width - 2 * margin

    is_arabic = target_language == "Arabic"
    if is_arabic:
        preferred_font, font_size = 'NotoNaskhArabic', 14
        error_label = "Arabic rendering error"
    else:
        preferred_font, font_size = 'NotoSans', 12
        error_label = "Text rendering error"

    # Pick the font before drawing so that a missing font still produces a
    # PDF with text in it rather than an empty page.
    font_name = _select_font(c, preferred_font, font_size)
    line_height = font_size * 1.5
    y = height - margin

    try:
        for line in text.split('\n'):
            if is_arabic and line:
                # Shape in logical order first; wrapping happens before the
                # visual reordering so words stay in reading order per line.
                line = arabic_reshaper.reshape(line)
            for piece in _wrap_to_width(c, line, font_name, font_size, max_width):
                if y < margin:
                    c.showPage()
                    y = height - margin
                    # The font is set again after every page break.
                    c.setFont(font_name, font_size)
                if is_arabic:
                    if piece:
                        # Reorder one output line at a time.
                        piece = get_display(piece)
                    # Right-align right-to-left text.
                    x = width - margin - c.stringWidth(piece, font_name, font_size)
                else:
                    x = margin
                c.drawString(x, y, piece)
                y -= line_height
    except Exception as e:
        # Best effort: report the problem and still return what was drawn.
        st.warning(f"{error_label}: {str(e)}")

    c.save()
    buffer.seek(0)
    return buffer


def create_pdf(text, target_language):
    """Render translated text into a PDF.

    Urdu is rendered through WeasyPrint from generated HTML; every other
    language is drawn line by line on a ReportLab canvas (right-aligned, with
    reshaping and bidi reordering, when the language is Arabic).

    Args:
        text: The text to render; lines are separated by a newline.
        target_language: Language name; "Urdu" and "Arabic" select the
            dedicated code paths, anything else uses the generic one.

    Returns:
        io.BytesIO: The PDF bytes, positioned at offset 0.
    """
    if target_language == "Urdu":
        return _render_urdu_pdf(text)
    return _render_canvas_pdf(text, target_language)
def translate_text(text, target_language, api_key, model="gpt-3.5-turbo"):
    """Translate text with the OpenAI chat completions API.

    Args:
        text: The text to translate.
        target_language: Name of the language to translate into; it is
            interpolated into the system prompt.
        api_key: OpenAI API key used to build the client.
        model: Chat model to use. Defaults to the model that was previously
            hard-coded.

    Returns:
        str: The translated text. An empty string when ``text`` is empty or
        whitespace only (no API call is made) or when the API returns no
        content. On any failure, a string of the form
        ``"Translation error: <message>"`` is returned rather than raising.
    """
    # Nothing to translate (for example an image-only PDF): skip the API call.
    if not text or not text.strip():
        return ""
    try:
        client = openai.OpenAI(api_key=api_key)
        # Enhanced prompt for better translation
        system_prompt = (
            f"You are a professional translator specializing in {target_language}.\n"
            f"Translate the following text to {target_language}, ensuring:\n"
            "1. Technical terms are accurately translated\n"
            "2. Maintain formal language and proper grammar\n"
            "3. Preserve formatting and structure\n"
            "4. Keep proper nouns and technical terms like 'AI', 'LLMs', 'Python' in English where appropriate\n"
            "5. Use culturally appropriate expressions\n"
            "6. For Urdu/Arabic, ensure proper character connections and diacritics\n"
            "7. Maintain professional and accurate technical translations\n"
            "8. Preserve line breaks and paragraph structure\n"
        )
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": text},
            ],
            temperature=0.3,
        )
        # The message content can be None; always hand back a string.
        return response.choices[0].message.content or ""
    except Exception as e:
        # Callers expect a string, so errors are reported in-band.
        return f"Translation error: {str(e)}"
# Set page config
st.set_page_config(page_title="PDF Translator", layout="wide")
# Try to register fonts at startup
register_fonts()
# Main app interface
st.title("PDF Document Translator")
# Add custom CSS for better text display
st.markdown("""
<style>
.stTextArea textarea {
font-size: 16px !important;
}
</style>
""", unsafe_allow_html=True)
# Language selection: display name -> language name sent to the translator
languages = {
    "English": "English",
    "Urdu": "Urdu",
    "Arabic": "Arabic",
    "Roman English": "Roman English",
    "Roman Urdu": "Roman Urdu",
    "Hindi": "Hindi",
    "Spanish": "Spanish",
    "French": "French"
}
# File uploader
uploaded_file = st.file_uploader("Upload your PDF file", type="pdf")
# API Key input field; a key typed here overrides the environment default
api_key_input = st.text_input("Enter OpenAI API Key:", type="password", key="api_key_input")
if api_key_input:
    api_key = api_key_input
# Language selector
target_language = st.selectbox(
    "Select target language",
    options=list(languages.keys())
)
# Create two columns for original and translated text
col1, col2 = st.columns(2)
# Session state for the last successful translation. These keys are kept
# separate from every widget key so the stored values survive reruns in which
# the widgets are not drawn.
if 'translated_text' not in st.session_state:
    st.session_state.translated_text = None
if 'translated_language' not in st.session_state:
    st.session_state.translated_language = None
if 'translated_source' not in st.session_state:
    st.session_state.translated_source = None
if uploaded_file is not None and api_key:
    # Identify the upload so a translation of another file is not reused
    source_name = getattr(uploaded_file, "name", None)
    if st.session_state.translated_source != source_name:
        st.session_state.translated_text = None
        st.session_state.translated_language = None
        st.session_state.translated_source = source_name
    # Extract text from PDF
    with st.spinner("Extracting text from PDF..."):
        text = extract_text_from_pdf(uploaded_file)
    # Show original text
    with col1:
        st.subheader("Original Text")
        st.text_area("", value=text, height=400, key="original_text")
    if not text.strip():
        st.warning("No text could be extracted from this PDF.")
    # Translate button
    elif st.button("Translate"):
        with st.spinner("Translating..."):
            translated_text = translate_text(text, languages[target_language], api_key) or ""
        if translated_text.startswith("Translation error:"):
            # translate_text reports failures in-band; show them as errors
            # instead of storing them as if they were a translation.
            st.error(translated_text)
            st.session_state.translated_text = None
            st.session_state.translated_language = None
        else:
            st.session_state.translated_text = translated_text
            st.session_state.translated_language = target_language
    # Show the stored translation on every rerun, not only right after the
    # button click, so it stays visible after other interactions.
    if st.session_state.translated_text:
        # Label and render with the language the text was translated into,
        # even if the selector has been changed since.
        shown_language = st.session_state.translated_language or target_language
        with col2:
            st.subheader(f"Translated Text ({shown_language})")
            st.text_area(
                "",
                value=st.session_state.translated_text,
                height=400,
                key="translated_text_display"
            )
        # Create PDF button
        if st.download_button(
            label="Download Translated PDF",
            data=create_pdf(st.session_state.translated_text, shown_language),
            file_name=f"translated_{shown_language}.pdf",
            mime="application/pdf"
        ):
            st.success("PDF downloaded successfully!")
elif not api_key:
    st.warning("Please enter your OpenAI API key to proceed.")
# Add instructions and notes
st.markdown("""
### Instructions:
1. Enter your OpenAI API key
2. Upload your PDF file
3. Select your target language
4. Click 'Translate' to get your translation
5. Review the translation
6. Click 'Download Translated PDF' to save as PDF
""")