akhaliq HF Staff committed on
Commit
a2a2a54
·
1 Parent(s): 82a5d4c

update image text extract

Browse files
Files changed (3) hide show
  1. README.md +18 -0
  2. app.py +47 -3
  3. requirements.txt +4 -1
README.md CHANGED
@@ -21,6 +21,7 @@ AnyCoder is an AI-powered code generator that helps you create applications by d
21
 
22
  - **Multi-Model Support**: Choose from various AI models including DeepSeek, ERNIE-4.5-VL, MiniMax, and Qwen
23
  - **Image-to-Code**: Upload UI design images and get corresponding HTML/CSS code (ERNIE-4.5-VL model)
 
24
  - **Live Preview**: See your generated code in action with the built-in sandbox
25
  - **Web Search Integration**: Enable real-time web search to get the latest information and best practices
26
  - **Chat History**: Keep track of your conversations and generated code
@@ -75,6 +76,23 @@ The web search feature uses Tavily to provide real-time information when generat
75
 
76
  When enabled, the AI will search the web for the latest information, best practices, and technologies related to your request.
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  ## Available Models
79
 
80
  - **DeepSeek V3**: Advanced code generation model
 
21
 
22
  - **Multi-Model Support**: Choose from various AI models including DeepSeek, ERNIE-4.5-VL, MiniMax, and Qwen
23
  - **Image-to-Code**: Upload UI design images and get corresponding HTML/CSS code (ERNIE-4.5-VL model)
24
+ - **Image Text Extraction**: Upload images and extract text using OCR for processing
25
  - **Live Preview**: See your generated code in action with the built-in sandbox
26
  - **Web Search Integration**: Enable real-time web search to get the latest information and best practices
27
  - **Chat History**: Keep track of your conversations and generated code
 
76
 
77
  When enabled, the AI will search the web for the latest information, best practices, and technologies related to your request.
78
 
79
+ ## Image Text Extraction
80
+
81
+ The application supports extracting text from images using OCR (Optical Character Recognition). This feature allows you to:
82
+
83
+ 1. Upload image files (JPG, PNG, BMP, TIFF, GIF, WebP) through the file input
84
+ 2. Automatically extract text from the images using Tesseract OCR
85
+ 3. Include the extracted text in your prompts for code generation
86
+
87
+ ### Setting up OCR
88
+
89
+ To use the image text extraction feature, you need to install Tesseract OCR on your system. See `install_tesseract.md` for detailed installation instructions.
90
+
91
+ **Example usage:**
92
+ - Upload an image containing text (like a screenshot, document, or handwritten notes)
93
+ - The application will extract the text and include it in your prompt
94
+ - You can then ask the AI to process, summarize, or work with the extracted text
95
+
96
  ## Available Models
97
 
98
  - **DeepSeek V3**: Advanced code generation model
app.py CHANGED
@@ -6,6 +6,10 @@ import base64
6
  import mimetypes
7
  import PyPDF2
8
  import docx
 
 
 
 
9
 
10
  import gradio as gr
11
  from huggingface_hub import InferenceClient
@@ -116,6 +120,10 @@ DEMO_LIST = [
116
  {
117
  "title": "UI from Image",
118
  "description": "Upload an image of a UI design and I'll generate the HTML/CSS code for it"
 
 
 
 
119
  }
120
  ]
121
 
@@ -372,6 +380,38 @@ def demo_card_click(e: gr.EventData):
372
  # Return the first demo description as fallback
373
  return DEMO_LIST[0]['description']
374
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
375
  def extract_text_from_file(file_path):
376
  if not file_path:
377
  return ""
@@ -391,6 +431,8 @@ def extract_text_from_file(file_path):
391
  elif ext == ".docx":
392
  doc = docx.Document(file_path)
393
  return "\n".join([para.text for para in doc.paragraphs])
 
 
394
  else:
395
  return ""
396
  except Exception as e:
@@ -471,7 +513,7 @@ with gr.Blocks(theme=gr.themes.Base(), title="AnyCoder - AI Code Generator") as
471
  gr.Markdown("# AnyCoder\nAI-Powered Code Generator")
472
  gr.Markdown("""Describe your app or UI in plain English. Optionally upload a UI image (for ERNIE model). Click Generate to get code and preview.""")
473
  gr.Markdown("**Tip:** For best search results about people or entities, include details like profession, company, or location. Example: 'John Smith software engineer at Google.'")
474
- gr.Markdown("**Tip:** You can attach a file (PDF, TXT, DOCX, CSV, MD) to use as reference for your prompt, e.g. 'Summarize this PDF.'")
475
  input = gr.Textbox(
476
  label="Describe your application",
477
  placeholder="e.g., Create a todo app with add, delete, and mark as complete functionality",
@@ -482,8 +524,8 @@ with gr.Blocks(theme=gr.themes.Base(), title="AnyCoder - AI Code Generator") as
482
  visible=False
483
  )
484
  file_input = gr.File(
485
- label="Attach a file (PDF, TXT, DOCX, CSV, MD)",
486
- file_types=[".pdf", ".txt", ".md", ".csv", ".docx"],
487
  visible=True
488
  )
489
  with gr.Row():
@@ -503,6 +545,8 @@ with gr.Blocks(theme=gr.themes.Base(), title="AnyCoder - AI Code Generator") as
503
  else:
504
  gr.Markdown("✅ **Web Search Available**: Toggle above to enable real-time search")
505
 
 
 
506
  gr.Markdown("### Quick Examples")
507
  for i, demo_item in enumerate(DEMO_LIST[:5]):
508
  demo_card = gr.Button(
 
6
  import mimetypes
7
  import PyPDF2
8
  import docx
9
+ import cv2
10
+ import numpy as np
11
+ from PIL import Image
12
+ import pytesseract
13
 
14
  import gradio as gr
15
  from huggingface_hub import InferenceClient
 
120
  {
121
  "title": "UI from Image",
122
  "description": "Upload an image of a UI design and I'll generate the HTML/CSS code for it"
123
+ },
124
+ {
125
+ "title": "Extract Text from Image",
126
+ "description": "Upload an image containing text and I'll extract and process the text content"
127
  }
128
  ]
129
 
 
380
  # Return the first demo description as fallback
381
  return DEMO_LIST[0]['description']
382
 
383
+ def extract_text_from_image(image_path):
384
+ """Extract text from image using OCR"""
385
+ try:
386
+ # Check if tesseract is available
387
+ try:
388
+ pytesseract.get_tesseract_version()
389
+ except Exception:
390
+ return "Error: Tesseract OCR is not installed. Please install Tesseract to extract text from images. See install_tesseract.md for instructions."
391
+
392
+ # Read image using OpenCV
393
+ image = cv2.imread(image_path)
394
+ if image is None:
395
+ return "Error: Could not read image file"
396
+
397
+ # Convert to RGB (OpenCV uses BGR)
398
+ image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
399
+
400
+ # Preprocess image for better OCR results
401
+ # Convert to grayscale
402
+ gray = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2GRAY)
403
+
404
+ # Apply thresholding to get binary image
405
+ _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
406
+
407
+ # Extract text using pytesseract
408
+ text = pytesseract.image_to_string(binary, config='--psm 6')
409
+
410
+ return text.strip() if text.strip() else "No text found in image"
411
+
412
+ except Exception as e:
413
+ return f"Error extracting text from image: {e}"
414
+
415
  def extract_text_from_file(file_path):
416
  if not file_path:
417
  return ""
 
431
  elif ext == ".docx":
432
  doc = docx.Document(file_path)
433
  return "\n".join([para.text for para in doc.paragraphs])
434
+ elif ext.lower() in [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".gif", ".webp"]:
435
+ return extract_text_from_image(file_path)
436
  else:
437
  return ""
438
  except Exception as e:
 
513
  gr.Markdown("# AnyCoder\nAI-Powered Code Generator")
514
  gr.Markdown("""Describe your app or UI in plain English. Optionally upload a UI image (for ERNIE model). Click Generate to get code and preview.""")
515
  gr.Markdown("**Tip:** For best search results about people or entities, include details like profession, company, or location. Example: 'John Smith software engineer at Google.'")
516
+ gr.Markdown("**Tip:** You can attach a file (PDF, TXT, DOCX, CSV, MD, Images) to use as reference for your prompt, e.g. 'Summarize this PDF' or 'Extract text from this image'.")
517
  input = gr.Textbox(
518
  label="Describe your application",
519
  placeholder="e.g., Create a todo app with add, delete, and mark as complete functionality",
 
524
  visible=False
525
  )
526
  file_input = gr.File(
527
+ label="Attach a file (PDF, TXT, DOCX, CSV, MD, Images)",
528
+ file_types=[".pdf", ".txt", ".md", ".csv", ".docx", ".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".gif", ".webp"],
529
  visible=True
530
  )
531
  with gr.Row():
 
545
  else:
546
  gr.Markdown("✅ **Web Search Available**: Toggle above to enable real-time search")
547
 
548
+ gr.Markdown("📷 **Image Text Extraction**: Upload images to extract text using OCR (requires Tesseract installation)")
549
+
550
  gr.Markdown("### Quick Examples")
551
  for i, demo_item in enumerate(DEMO_LIST[:5]):
552
  demo_card = gr.Button(
requirements.txt CHANGED
@@ -2,4 +2,7 @@ git+https://github.com/huggingface/huggingface_hub.git
2
  gradio[oauth]
3
  tavily-python
4
  PyPDF2
5
- python-docx
 
 
 
 
2
  gradio[oauth]
3
  tavily-python
4
  PyPDF2
5
+ python-docx
6
+ pytesseract
7
+ Pillow
8
+ opencv-python