mitlist/be/app/config.py

# app/config.py
import os
from pydantic_settings import BaseSettings
from dotenv import load_dotenv
import logging
import secrets
from typing import List

load_dotenv()
logger = logging.getLogger(__name__)

class Settings(BaseSettings):
    """Central application configuration.

    Each annotated attribute is a settings field. Values are taken from the
    process environment and from the ``.env`` file named in ``Config``; keys
    that do not match a field are ignored (``extra = "ignore"``).
    ``SECRET_KEY`` is the only field without a default, so it must always be
    supplied.

    The properties at the bottom derive convenience values (parsed CORS
    origins, environment flags, documentation URLs) from the raw fields.
    """

    # --- External services (optional at load time; checked after instantiation) ---
    DATABASE_URL: str | None = None
    GEMINI_API_KEY: str | None = None
    SENTRY_DSN: str | None = None  # Sentry DSN for error tracking

    # --- Environment Settings ---
    ENVIRONMENT: str = "development"  # development, staging, production

    # --- JWT Settings --- (SECRET_KEY is used by FastAPI-Users)
    SECRET_KEY: str  # No default: must be set via environment variable
    TOKEN_TYPE: str = "bearer"  # Default token type for JWT authentication
    # FastAPI-Users handles JWT algorithm internally

    # --- OCR Settings ---
    MAX_FILE_SIZE_MB: int = 10  # Maximum allowed file size for OCR processing
    ALLOWED_IMAGE_TYPES: list[str] = ["image/jpeg", "image/png", "image/webp"]  # Supported image formats
    # Full instruction prompt for the shopping-list extraction; it specifies the
    # JSON structure the model is asked to return.
    OCR_ITEM_EXTRACTION_PROMPT: str = """
**ROLE & GOAL**

You are an expert AI assistant specializing in Optical Character Recognition (OCR) and structured data extraction. Your primary function is to act as a "Shopping List Digitizer."

Your goal is to meticulously analyze the provided image of a shopping list, which is likely handwritten, and convert it into a structured, machine-readable JSON format. You must be accurate, infer context where necessary, and handle the inherent ambiguities of handwriting and informal list-making.

**INPUT**

You will receive a single image (`[Image]`). This image contains a shopping list. It may be:
*   Neatly written or very messy.
*   On lined paper, a whiteboard, a napkin, or a dedicated notepad.
*   Containing doodles, stains, or other visual noise.
*   Using various formats (bullet points, numbered lists, columns, simple line breaks).
*   could be in English or in German.

**CORE TASK: STEP-BY-STEP ANALYSIS**

Follow these steps precisely:

1.  **Initial Image Analysis & OCR:**
    *   Perform an advanced OCR scan on the entire image to transcribe all visible text.
    *   Pay close attention to the spatial layout. Identify headings, columns, and line items. Note which text elements appear to be grouped together.

2.  **Item Identification & Filtering:**
    *   Differentiate between actual list items and non-item elements.
    *   **INCLUDE:** Items intended for purchase.
    *   **EXCLUDE:** List titles (e.g., "GROCERIES," "Target List"), dates, doodles, unrelated notes, or stray marks. Capture the list title separately if one exists.

3.  **Detailed Extraction for Each Item:**
    For every single item you identify, extract the following attributes. If an attribute is not present, use `null`.

    *   `item_name` (string): The primary name of the product.
        *   **Standardize:** Normalize the name. (e.g., "B. Powder" -> "Baking Powder", "A. Juice" -> "Apple Juice").
        *   **Contextual Guessing:** If a word is poorly written, use the context of a shopping list to make an educated guess. (e.g., "Ciffee" is almost certainly "Coffee").

    *   `quantity` (number or string): The amount needed.
        *   If a number is present (e.g., "**2** milks"), extract the number `2`.
        *   If it's a word (e.g., "**a dozen** eggs"), extract the string `"a dozen"`.
        *   If no quantity is specified (e.g., "Bread"), infer a default quantity of `1`.

    *   `unit` (string): The unit of measurement or packaging.
        *   Examples: "kg", "lbs", "liters", "gallons", "box", "can", "bag", "bunch".
        *   Infer where possible (e.g., for "2 Milks," the unit could be inferred as "cartons" or "gallons" depending on regional context, but it's safer to leave it `null` if not explicitly stated).

    *   `notes` (string): Any additional descriptive text.
        *   Examples: "low-sodium," "organic," "brand name (Tide)," "for the cake," "get the ripe ones."

    *   `category` (string): Infer a logical category for the item.
        *   Use common grocery store categories: `Produce`, `Dairy & Eggs`, `Meat & Seafood`, `Pantry`, `Frozen`, `Bakery`, `Beverages`, `Household`, `Personal Care`.
        *   If the list itself has category headings (e.g., a "DAIRY" section), use those first.

    *   `original_text` (string): Provide the exact, unaltered text that your OCR transcribed for this entire line item. This is crucial for verification.

    *   `is_crossed_out` (boolean): Set to `true` if the item is struck through, crossed out, or clearly marked as completed. Otherwise, set to `false`.

**HANDLING AMBIGUITIES AND EDGE CASES**

*   **Illegible Text:** If a line or word is completely unreadable, set `item_name` to `"UNREADABLE"` and place the garbled OCR attempt in the `original_text` field.
*   **Abbreviations:** Expand common shopping list abbreviations (e.g., "OJ" -> "Orange Juice", "TP" -> "Toilet Paper", "AVOs" -> "Avocados", "G. Beef" -> "Ground Beef").
*   **Implicit Items:** If a line is vague like "Snacks for kids," list it as is. Do not invent specific items.
*   **Multi-item Lines:** If a line contains multiple items (e.g., "Onions, Garlic, Ginger"), split them into separate item objects.

**OUTPUT FORMAT**

Your final output MUST be a single JSON object with the following structure. Do not include any explanatory text before or after the JSON block.

```json
{
  "list_title": "string or null",
  "items": [
    {
      "item_name": "string",
      "quantity": "number or string",
      "unit": "string or null",
      "category": "string",
      "notes": "string or null",
      "original_text": "string",
      "is_crossed_out": "boolean"
    }
  ],
  "summary": {
    "total_items": "integer",
    "unread_items": "integer",
    "crossed_out_items": "integer"
  }
}
```

**EXAMPLE WALKTHROUGH**

*   **IF THE IMAGE SHOWS:** A crumpled sticky note with the title "Stuff for tonight" and the items:
    *   `2x Chicken Breasts`
    *   `~~Baguette~~` (this item is crossed out)
    *   `Salad mix (bag)`
    *   `Tomatos` (misspelled)
    *   `Choc Ice Cream`

*   **YOUR JSON OUTPUT SHOULD BE:**

```json
{
  "list_title": "Stuff for tonight",
  "items": [
    {
      "item_name": "Chicken Breasts",
      "quantity": 2,
      "unit": null,
      "category": "Meat & Seafood",
      "notes": null,
      "original_text": "2x Chicken Breasts",
      "is_crossed_out": false
    },
    {
      "item_name": "Baguette",
      "quantity": 1,
      "unit": null,
      "category": "Bakery",
      "notes": null,
      "original_text": "Baguette",
      "is_crossed_out": true
    },
    {
      "item_name": "Salad Mix",
      "quantity": 1,
      "unit": "bag",
      "category": "Produce",
      "notes": null,
      "original_text": "Salad mix (bag)",
      "is_crossed_out": false
    },
    {
      "item_name": "Tomatoes",
      "quantity": 1,
      "unit": null,
      "category": "Produce",
      "notes": null,
      "original_text": "Tomatos",
      "is_crossed_out": false
    },
    {
      "item_name": "Chocolate Ice Cream",
      "quantity": 1,
      "unit": null,
      "category": "Frozen",
      "notes": null,
      "original_text": "Choc Ice Cream",
      "is_crossed_out": false
    }
  ],
  "summary": {
    "total_items": 5,
    "unread_items": 0,
    "crossed_out_items": 1
  }
}
```

**FINAL INSTRUCTION**

If the image provided is not a shopping list or is completely blank/unintelligible, respond with a JSON object where the `items` array is empty and add a note in the `list_title` field, such as "Image does not appear to be a shopping list."

Now, analyze the provided image and generate the JSON output.
"""
    # --- OCR Error Messages ---
    # Templates containing {placeholders} are meant to be filled with str.format().
    OCR_SERVICE_UNAVAILABLE: str = "OCR service is currently unavailable. Please try again later."
    OCR_SERVICE_CONFIG_ERROR: str = "OCR service configuration error. Please contact support."
    OCR_UNEXPECTED_ERROR: str = "An unexpected error occurred during OCR processing."
    OCR_QUOTA_EXCEEDED: str = "OCR service quota exceeded. Please try again later."
    OCR_INVALID_FILE_TYPE: str = "Invalid file type. Supported types: {types}"
    OCR_FILE_TOO_LARGE: str = "File too large. Maximum size: {size}MB"
    OCR_PROCESSING_ERROR: str = "Error processing image: {detail}"

    # --- Gemini AI Settings ---
    GEMINI_MODEL_NAME: str = "gemini-2.5-flash-preview-05-20"  # The model to use for OCR
    # Harm category -> blocking threshold.
    GEMINI_SAFETY_SETTINGS: dict = {
        "HARM_CATEGORY_HATE_SPEECH": "BLOCK_MEDIUM_AND_ABOVE",
        "HARM_CATEGORY_DANGEROUS_CONTENT": "BLOCK_MEDIUM_AND_ABOVE",
        "HARM_CATEGORY_HARASSMENT": "BLOCK_MEDIUM_AND_ABOVE",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT": "BLOCK_MEDIUM_AND_ABOVE",
    }
    # Sampling / output-size parameters for generation requests.
    GEMINI_GENERATION_CONFIG: dict = {
        "candidate_count": 1,
        "max_output_tokens": 2048,
        "temperature": 0.9,
        "top_p": 1,
        "top_k": 1
    }

    # --- API Settings ---
    API_PREFIX: str = "/api"  # Base path for all API endpoints
    API_OPENAPI_URL: str = "/api/openapi.json"
    API_DOCS_URL: str = "/api/docs"
    API_REDOC_URL: str = "/api/redoc"

    # CORS origins as one comma-separated string (override per environment);
    # use the `cors_origins_list` property to get the parsed list.
    CORS_ORIGINS: str = "http://localhost:5173,http://localhost:5174,http://localhost:8000,http://127.0.0.1:5173,http://127.0.0.1:5174,http://127.0.0.1:8000"
    FRONTEND_URL: str = "http://localhost:5173"  # URL for the frontend application

    # --- API Metadata ---
    API_TITLE: str = "Shared Lists API"
    API_DESCRIPTION: str = "API for managing shared shopping lists, OCR, and cost splitting."
    API_VERSION: str = "0.1.0"
    ROOT_MESSAGE: str = "Welcome to the Shared Lists API! Docs available at /api/docs"

    # --- Logging Settings ---
    LOG_LEVEL: str = "WARNING"
    LOG_FORMAT: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

    # --- Health Check Settings ---
    HEALTH_STATUS_OK: str = "ok"
    HEALTH_STATUS_ERROR: str = "error"

    # --- HTTP Status Messages ---
    HTTP_400_DETAIL: str = "Bad Request"
    HTTP_401_DETAIL: str = "Unauthorized"
    HTTP_403_DETAIL: str = "Forbidden"
    HTTP_404_DETAIL: str = "Not Found"
    HTTP_422_DETAIL: str = "Unprocessable Entity"
    HTTP_429_DETAIL: str = "Too Many Requests"
    HTTP_500_DETAIL: str = "Internal Server Error"
    HTTP_503_DETAIL: str = "Service Unavailable"

    # --- Database Error Messages ---
    DB_CONNECTION_ERROR: str = "Database connection error"
    DB_INTEGRITY_ERROR: str = "Database integrity error"
    DB_TRANSACTION_ERROR: str = "Database transaction error"
    DB_QUERY_ERROR: str = "Database query error"

    # --- Auth Error Messages ---
    AUTH_INVALID_CREDENTIALS: str = "Invalid username or password"
    AUTH_NOT_AUTHENTICATED: str = "Not authenticated"
    AUTH_JWT_ERROR: str = "JWT token error: {error}"
    AUTH_JWT_UNEXPECTED_ERROR: str = "Unexpected JWT error: {error}"
    AUTH_HEADER_NAME: str = "WWW-Authenticate"
    AUTH_HEADER_PREFIX: str = "Bearer"

    # --- OAuth Settings ---
    # IMPORTANT: For Google OAuth to work, you MUST set the following environment variables
    # (e.g., in your .env file):
    # GOOGLE_CLIENT_ID: Your Google Cloud project's OAuth 2.0 Client ID
    # GOOGLE_CLIENT_SECRET: Your Google Cloud project's OAuth 2.0 Client Secret
    # Ensure the GOOGLE_REDIRECT_URI below matches the one configured in your Google Cloud Console.
    GOOGLE_CLIENT_ID: str = ""
    GOOGLE_CLIENT_SECRET: str = ""
    GOOGLE_REDIRECT_URI: str = "https://mitlistbe.mohamad.dev/api/v1/auth/google/callback"

    APPLE_CLIENT_ID: str = ""
    APPLE_TEAM_ID: str = ""
    APPLE_KEY_ID: str = ""
    APPLE_PRIVATE_KEY: str = ""
    APPLE_REDIRECT_URI: str = "https://mitlistbe.mohamad.dev/api/v1/auth/apple/callback"

    # --- Session Settings ---
    # Optional at load time so local/test runs can start without it. The
    # module-level checks that follow this class raise in production when it is
    # missing and generate a temporary value in every other environment.
    SESSION_SECRET_KEY: str | None = None
    # Shorter token lifetime to reduce risk if a token is leaked.
    ACCESS_TOKEN_EXPIRE_MINUTES: int = 60

    # --- Redis Settings ---
    REDIS_URL: str = "redis://localhost:6379"
    REDIS_PASSWORD: str = ""

    class Config:
        env_file = ".env"
        env_file_encoding = 'utf-8'
        extra = "ignore"

    @property
    def cors_origins_list(self) -> list[str]:
        """Return ``CORS_ORIGINS`` split on commas, as a list of origins.

        Surrounding whitespace is stripped from each entry. Blank entries
        (from an empty setting, a trailing comma, or doubled commas) are
        dropped so the result never contains an empty-string origin.
        """
        candidates = (origin.strip() for origin in self.CORS_ORIGINS.split(","))
        return [origin for origin in candidates if origin]

    @property
    def is_production(self) -> bool:
        """Return True when ``ENVIRONMENT`` equals "production" (case-insensitive)."""
        return self.ENVIRONMENT.lower() == "production"

    @property
    def is_development(self) -> bool:
        """Return True when ``ENVIRONMENT`` equals "development" (case-insensitive)."""
        return self.ENVIRONMENT.lower() == "development"

    @property
    def docs_url(self) -> str | None:
        """Return ``API_DOCS_URL`` in development, ``None`` in any other environment."""
        return self.API_DOCS_URL if self.is_development else None

    @property
    def redoc_url(self) -> str | None:
        """Return ``API_REDOC_URL`` in development, ``None`` in any other environment."""
        return self.API_REDOC_URL if self.is_development else None

    @property
    def openapi_url(self) -> str | None:
        """Return ``API_OPENAPI_URL`` in development, ``None`` in any other environment."""
        return self.API_OPENAPI_URL if self.is_development else None

# Module-level settings instance, built once when this module is imported.
settings = Settings()

# --- Validation for critical settings ---
# Truthiness (not `is None`) so that an empty value such as `DATABASE_URL=`
# is rejected here instead of slipping through as a "set" value.
if not settings.DATABASE_URL:
    raise ValueError("DATABASE_URL environment variable must be set.")

# Dynamically generate a session secret in non-production environments to
# maintain backwards-compatibility with local test setups while still failing
# hard in production if a proper secret is missing.
if not settings.SESSION_SECRET_KEY:
    if settings.is_production:
        raise ValueError("SESSION_SECRET_KEY environment variable must be set in production")
    generated_secret = secrets.token_urlsafe(32)
    # object.__setattr__ writes the attribute directly on the instance instead
    # of going through the settings model's own __setattr__.
    object.__setattr__(settings, "SESSION_SECRET_KEY", generated_secret)
    logger.warning("SESSION_SECRET_KEY not provided; generated a temporary secret for development use.")

# Enforce secure secret key (catches an explicitly empty value).
if not settings.SECRET_KEY:
    raise ValueError("SECRET_KEY environment variable must be set. Generate a secure key using: openssl rand -hex 32")

# Validate secret key strength
if len(settings.SECRET_KEY) < 32:
    raise ValueError("SECRET_KEY must be at least 32 characters long for security")

# Production-specific validations
if settings.is_production and not settings.SENTRY_DSN:
    logger.warning("SENTRY_DSN not set in production environment. Error tracking will be unavailable.")

# Gemini is optional: a missing key is logged, not raised.
if not settings.GEMINI_API_KEY:
    logger.error("CRITICAL: GEMINI_API_KEY environment variable not set. Gemini features will be unavailable.")
else:
    # Log only a short prefix for confirmation (never the full key).
    logger.info("GEMINI_API_KEY loaded (starts with: %s...).", settings.GEMINI_API_KEY[:4])

# Log environment information. The documentation URLs are exposed only when
# `is_development` is true (see Settings.docs_url), so the message is keyed on
# that flag rather than on "not production".
logger.info("Application starting in %s environment", settings.ENVIRONMENT)
if settings.is_development:
    logger.info("Development mode: API documentation available at %s", settings.API_DOCS_URL)
elif settings.is_production:
    logger.info("Production mode: API documentation disabled")
else:
    logger.info("%s mode: API documentation disabled", settings.ENVIRONMENT)