"""Gemini-backed OCR helpers for extracting shopping-list items from images."""

import logging
from typing import List

import google.generativeai as genai
from google.api_core import exceptions as google_exceptions

from app.config import settings
from app.core.exceptions import (
    OCRServiceUnavailableError,
    OCRServiceConfigError,
    OCRUnexpectedError,
    OCRQuotaExceededError,
    OCRProcessingError
)

logger = logging.getLogger(__name__)

# Module-level client, configured once at import time. If configuration fails,
# the reason is kept in `gemini_initialization_error` and the client stays None
# so that `get_gemini_client()` can report the failure lazily.
gemini_flash_client = None
gemini_initialization_error = None

try:
    if settings.GEMINI_API_KEY:
        genai.configure(api_key=settings.GEMINI_API_KEY)
        gemini_flash_client = genai.GenerativeModel(
            model_name=settings.GEMINI_MODEL_NAME,
            generation_config=genai.types.GenerationConfig(
                **settings.GEMINI_GENERATION_CONFIG
            )
        )
        logger.info(f"Gemini AI client initialized successfully for model '{settings.GEMINI_MODEL_NAME}'.")
    else:
        gemini_initialization_error = "GEMINI_API_KEY not configured. Gemini client not initialized."
        logger.error(gemini_initialization_error)
except Exception as e:
    # Import must not fail because of a bad Gemini configuration; record the
    # problem and let callers hit OCRServiceConfigError on first use instead.
    gemini_initialization_error = f"Failed to initialize Gemini AI client: {e}"
    logger.exception(gemini_initialization_error)
    gemini_flash_client = None


def get_gemini_client():
    """Return the module-level Gemini client.

    Raises:
        OCRServiceConfigError: If initialization recorded an error or the
            client was never created.
    """
    if gemini_initialization_error or gemini_flash_client is None:
        raise OCRServiceConfigError()
    return gemini_flash_client


# NOTE(review): the extraction calls below read the prompt from
# `settings.OCR_ITEM_EXTRACTION_PROMPT`, not from this module-level constant.
# Confirm which of the two is authoritative.
OCR_ITEM_EXTRACTION_PROMPT = """
**ROLE & GOAL**

You are an expert AI assistant specializing in Optical Character Recognition (OCR) and structured data extraction. Your primary function is to act as a "Shopping List Digitizer."

Your goal is to meticulously analyze the provided image of a shopping list, which is likely handwritten, and convert it into a structured, machine-readable JSON format. You must be accurate, infer context where necessary, and handle the inherent ambiguities of handwriting and informal list-making.

**INPUT**

You will receive a single image (`[Image]`). This image contains a shopping list. It may be:
*   Neatly written or very messy.
*   On lined paper, a whiteboard, a napkin, or a dedicated notepad.
*   Containing doodles, stains, or other visual noise.
*   Using various formats (bullet points, numbered lists, columns, simple line breaks).
*   could be in English or in German.

**CORE TASK: STEP-BY-STEP ANALYSIS**

Follow these steps precisely:

1.  **Initial Image Analysis & OCR:**
    *   Perform an advanced OCR scan on the entire image to transcribe all visible text.
    *   Pay close attention to the spatial layout. Identify headings, columns, and line items. Note which text elements appear to be grouped together.

2.  **Item Identification & Filtering:**
    *   Differentiate between actual list items and non-item elements.
    *   **INCLUDE:** Items intended for purchase.
    *   **EXCLUDE:** List titles (e.g., "GROCERIES," "Target List"), dates, doodles, unrelated notes, or stray marks. Capture the list title separately if one exists.

3.  **Detailed Extraction for Each Item:**
    For every single item you identify, extract the following attributes. If an attribute is not present, use `null`.

    *   `item_name` (string): The primary name of the product.
        *   **Standardize:** Normalize the name. (e.g., "B. Powder" -> "Baking Powder", "A. Juice" -> "Apple Juice").
        *   **Contextual Guessing:** If a word is poorly written, use the context of a shopping list to make an educated guess. (e.g., "Ciffee" is almost certainly "Coffee").

    *   `quantity` (number or string): The amount needed.
        *   If a number is present (e.g., "**2** milks"), extract the number `2`.
        *   If it's a word (e.g., "**a dozen** eggs"), extract the string `"a dozen"`.
        *   If no quantity is specified (e.g., "Bread"), infer a default quantity of `1`.

    *   `unit` (string): The unit of measurement or packaging.
        *   Examples: "kg", "lbs", "liters", "gallons", "box", "can", "bag", "bunch".
        *   Infer where possible (e.g., for "2 Milks," the unit could be inferred as "cartons" or "gallons" depending on regional context, but it's safer to leave it `null` if not explicitly stated).

    *   `notes` (string): Any additional descriptive text.
        *   Examples: "low-sodium," "organic," "brand name (Tide)," "for the cake," "get the ripe ones."

    *   `category` (string): Infer a logical category for the item.
        *   Use common grocery store categories: `Produce`, `Dairy & Eggs`, `Meat & Seafood`, `Pantry`, `Frozen`, `Bakery`, `Beverages`, `Household`, `Personal Care`.
        *   If the list itself has category headings (e.g., a "DAIRY" section), use those first.

    *   `original_text` (string): Provide the exact, unaltered text that your OCR transcribed for this entire line item. This is crucial for verification.

    *   `is_crossed_out` (boolean): Set to `true` if the item is struck through, crossed out, or clearly marked as completed. Otherwise, set to `false`.

**HANDLING AMBIGUITIES AND EDGE CASES**

*   **Illegible Text:** If a line or word is completely unreadable, set `item_name` to `"UNREADABLE"` and place the garbled OCR attempt in the `original_text` field.
*   **Abbreviations:** Expand common shopping list abbreviations (e.g., "OJ" -> "Orange Juice", "TP" -> "Toilet Paper", "AVOs" -> "Avocados", "G. Beef" -> "Ground Beef").
*   **Implicit Items:** If a line is vague like "Snacks for kids," list it as is. Do not invent specific items.
*   **Multi-item Lines:** If a line contains multiple items (e.g., "Onions, Garlic, Ginger"), split them into separate item objects.

**OUTPUT FORMAT**

Your final output MUST be a single JSON object with the following structure. Do not include any explanatory text before or after the JSON block.

```json
{
  "list_title": "string or null",
  "items": [
    {
      "item_name": "string",
      "quantity": "number or string",
      "unit": "string or null",
      "category": "string",
      "notes": "string or null",
      "original_text": "string",
      "is_crossed_out": "boolean"
    }
  ],
  "summary": {
    "total_items": "integer",
    "unread_items": "integer",
    "crossed_out_items": "integer"
  }
}
```

**EXAMPLE WALKTHROUGH**

*   **IF THE IMAGE SHOWS:** A crumpled sticky note with the title "Stuff for tonight" and the items:
    *   `2x Chicken Breasts`
    *   `~~Baguette~~` (this item is crossed out)
    *   `Salad mix (bag)`
    *   `Tomatos` (misspelled)
    *   `Choc Ice Cream`

*   **YOUR JSON OUTPUT SHOULD BE:**
    ```json
    {
      "list_title": "Stuff for tonight",
      "items": [
        {
          "item_name": "Chicken Breasts",
          "quantity": 2,
          "unit": null,
          "category": "Meat & Seafood",
          "notes": null,
          "original_text": "2x Chicken Breasts",
          "is_crossed_out": false
        },
        {
          "item_name": "Baguette",
          "quantity": 1,
          "unit": null,
          "category": "Bakery",
          "notes": null,
          "original_text": "Baguette",
          "is_crossed_out": true
        },
        {
          "item_name": "Salad Mix",
          "quantity": 1,
          "unit": "bag",
          "category": "Produce",
          "notes": null,
          "original_text": "Salad mix (bag)",
          "is_crossed_out": false
        },
        {
          "item_name": "Tomatoes",
          "quantity": 1,
          "unit": null,
          "category": "Produce",
          "notes": null,
          "original_text": "Tomatos",
          "is_crossed_out": false
        },
        {
          "item_name": "Chocolate Ice Cream",
          "quantity": 1,
          "unit": null,
          "category": "Frozen",
          "notes": null,
          "original_text": "Choc Ice Cream",
          "is_crossed_out": false
        }
      ],
      "summary": {
        "total_items": 5,
        "unread_items": 0,
        "crossed_out_items": 1
      }
    }
    ```

**FINAL INSTRUCTION**

If the image provided is not a shopping list or is completely blank/unintelligible, respond with a JSON object where the `items` array is empty and add a note in the `list_title` field, such as "Image does not appear to be a shopping list."

Now, analyze the provided image and generate the JSON output.
"""


def _is_safety_block(candidate) -> bool:
    """Return True when the candidate's finish reason is SAFETY.

    The finish reason is compared by symbolic name so the check works whether
    the SDK hands back an enum member or a plain string; comparing an enum
    member directly against the string 'SAFETY' is never true.
    """
    finish_reason = getattr(candidate, "finish_reason", None)
    reason_name = getattr(finish_reason, "name", None) or str(finish_reason)
    return reason_name == "SAFETY"


def _safety_error(candidate) -> OCRProcessingError:
    """Build the OCRProcessingError reported for a safety-blocked candidate."""
    safety_ratings = getattr(candidate, "safety_ratings", "N/A")
    return OCRProcessingError(
        f"Gemini response blocked due to safety settings. Ratings: {safety_ratings}"
    )


def _ensure_response_has_content(response) -> None:
    """Raise if the response carries no candidate with content parts.

    This must run before `response.text` is read.
    NOTE(review): per the SDK docs that accessor raises when no valid part
    was returned - confirm for the pinned SDK version.

    Raises:
        OCRProcessingError: If the first candidate was blocked for safety.
        OCRUnexpectedError: If the response is empty for any other reason.
    """
    candidates = getattr(response, "candidates", None)
    if candidates and candidates[0].content.parts:
        return

    logger.warning("Gemini response blocked or empty.", extra={"response": response})
    if candidates and _is_safety_block(candidates[0]):
        raise _safety_error(candidates[0])
    raise OCRUnexpectedError()


def _split_into_items(raw_text: str, skip_prefix: str = "") -> List[str]:
    """Split model output into stripped, non-trivial lines.

    Lines of one character or fewer are dropped, as are lines starting with
    `skip_prefix` when one is given.

    NOTE(review): OCR_ITEM_EXTRACTION_PROMPT above asks the model for a JSON
    document, while this parser returns raw text lines. Changing that would
    alter the return contract, so it is left for a follow-up.
    """
    items = []
    for line in raw_text.splitlines():
        cleaned_line = line.strip()
        if len(cleaned_line) <= 1:
            continue
        if skip_prefix and cleaned_line.startswith(skip_prefix):
            continue
        items.append(cleaned_line)
    return items


def _translate_api_error(error: Exception) -> Exception:
    """Map a Google API error onto the matching OCR domain exception."""
    if "quota" in str(error).lower():
        return OCRQuotaExceededError()
    return OCRServiceUnavailableError()


async def extract_items_from_image_gemini(image_bytes: bytes, mime_type: str = "image/jpeg") -> List[str]:
    """
    Uses Gemini Flash to extract shopping list items from image bytes.

    Args:
        image_bytes: The image content as bytes.
        mime_type: The MIME type of the image (e.g., "image/jpeg", "image/png", "image/webp").

    Returns:
        A list of extracted item strings (one per non-trivial response line).

    Raises:
        OCRServiceConfigError: If the Gemini client is not initialized.
        OCRQuotaExceededError: If API quota is exceeded.
        OCRServiceUnavailableError: For general API call errors.
        OCRProcessingError: If the response is blocked by safety settings.
        OCRUnexpectedError: If the response is empty, or for any other unexpected error.
    """
    try:
        client = get_gemini_client()  # Raises OCRServiceConfigError if not initialized

        image_part = {
            "mime_type": mime_type,
            "data": image_bytes
        }
        prompt_parts = [
            settings.OCR_ITEM_EXTRACTION_PROMPT,
            image_part
        ]

        logger.info("Sending image to Gemini for item extraction...")
        response = await client.generate_content_async(prompt_parts)

        _ensure_response_has_content(response)

        raw_text = response.text
        logger.info("Received raw text from Gemini.")

        items = _split_into_items(raw_text)
        logger.info(f"Extracted {len(items)} potential items.")
        return items

    except google_exceptions.GoogleAPIError as e:
        logger.error(f"Gemini API Error: {e}", exc_info=True)
        raise _translate_api_error(e) from e
    except (OCRServiceConfigError, OCRQuotaExceededError, OCRServiceUnavailableError,
            OCRProcessingError, OCRUnexpectedError):
        # Domain errors raised above are already in their final form.
        raise
    except Exception as e:
        logger.error(f"Unexpected error during Gemini item extraction: {e}", exc_info=True)
        raise OCRUnexpectedError() from e


class GeminiOCRService:
    """OCR service that owns its own Gemini model instance."""

    def __init__(self):
        """Configure the SDK and build the model from application settings.

        Raises:
            OCRServiceConfigError: If the Gemini client cannot be created.
        """
        try:
            genai.configure(api_key=settings.GEMINI_API_KEY)
            self.model = genai.GenerativeModel(
                model_name=settings.GEMINI_MODEL_NAME,
                generation_config=genai.types.GenerationConfig(
                    **settings.GEMINI_GENERATION_CONFIG
                )
            )
        except Exception as e:
            logger.error(f"Failed to initialize Gemini client: {e}")
            raise OCRServiceConfigError() from e

    async def extract_items(self, image_data: bytes, mime_type: str = "image/jpeg") -> List[str]:
        """
        Extract shopping list items from an image using Gemini Vision.

        Args:
            image_data: The image content as bytes.
            mime_type: The MIME type of the image (e.g., "image/jpeg", "image/png", "image/webp").

        Returns:
            A list of extracted item strings; lines starting with "Example" are skipped.

        Raises:
            OCRQuotaExceededError: If API quota is exceeded.
            OCRServiceUnavailableError: For general API call errors.
            OCRProcessingError: If the response is blocked by safety settings.
            OCRUnexpectedError: If the response is empty, or for any other unexpected error.
        """
        try:
            image_parts = [{"mime_type": mime_type, "data": image_data}]
            response = await self.model.generate_content_async(
                contents=[settings.OCR_ITEM_EXTRACTION_PROMPT, *image_parts]
            )

            # Validate candidates before touching `response.text`, so a blocked
            # response is reported as a safety block rather than a generic
            # unexpected error.
            _ensure_response_has_content(response)
            first_candidate = response.candidates[0]
            if _is_safety_block(first_candidate):
                raise _safety_error(first_candidate)

            if not response.text:
                logger.warning("Gemini response is empty")
                raise OCRUnexpectedError()

            items = _split_into_items(response.text, skip_prefix="Example")
            logger.info(f"Extracted {len(items)} potential items.")
            return items

        except google_exceptions.GoogleAPIError as e:
            logger.error(f"Error during OCR extraction: {e}")
            raise _translate_api_error(e) from e
        except (OCRServiceConfigError, OCRQuotaExceededError, OCRServiceUnavailableError,
                OCRProcessingError, OCRUnexpectedError):
            # Domain errors raised above are already in their final form.
            raise
        except Exception as e:
            logger.error(f"Unexpected error during Gemini item extraction: {e}", exc_info=True)
            raise OCRUnexpectedError() from e