Reference

document_chat()

Serves the document chat page using cached content based on the provided content ID.

Retrieves cached data (title and content) for a given contentId passed as a query parameter and renders a chat interface for continued conversation with the document.

Examples:

None

Returns:

Type: Any

Rendered HTML page (documentChat.html) with:
  • content_id: The ID of the requested content.
  • container_title_chat: The title of the document.
  • content_chat: The previously generated content or an error message.
Source code in src/main.py (lines 701-752)
@app.route("/documentChat")
def document_chat() -> Any:
    """Serves the document chat page using cached content based on the provided content ID.

    Retrieves cached data (title and content) for a given contentId passed as a query parameter
    and renders a chat interface for continued conversation with the document.

    Examples:
        None

    Returns:
        Rendered HTML page (documentChat.html) with:
        - content_id: The ID of the requested content.
        - container_title_chat: The title of the document.
        - content_chat: The previously generated content or an error message.

    Raises:
        None
    """

    content_id = request.args.get("contentId")  # Get ID from URL query ?contentId=...
    log.info(f"Langchain chat request for contentId: {content_id}")

    # Retrieve data from cache
    cached_data = cache.get(content_id)
    log.debug(f"Cache lookup for {content_id} returned: {type(cached_data)}")

    if cached_data:
        container_title_chat = cached_data.get("title", "Unknown Title")
        content_chat = cached_data.get("content", "<p>Content not found.</p>")
        chat_history = cached_data.get("chat_history", [])
        log.info(f"Found content for {content_id} in cache.")
        log.info(f"Found {len(chat_history)} messages in history for {content_id}.")

        for message in chat_history:
            if message.get("role") == "model" and message.get("parts"):
                # Convert the raw markdown in 'parts' to HTML
                raw_markdown = message["parts"][0]
                message["parts"][0] = markdown.markdown(raw_markdown)
    else:
        container_title_chat = "Error"
        content_chat = f"<p>Could not find content for ID: {content_id}. Cache might be empty or ID is invalid.</p>"
        chat_history = []
        log.warning(f"Content for {content_id} not found in cache.")

    return render_template(
        "documentChat.html",
        content_id=content_id,
        container_title_chat=container_title_chat,
        content_chat=content_chat,
        chat_history=chat_history,
    )
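
Example (a minimal sketch, assuming the Flask app and cache can be imported from src/main.py and that a cache entry exists for the given ID; the ID and cached values below are illustrative):

from main import app, cache  # import path depends on how the project is installed

content_id = "example-content-id"  # hypothetical ID stored earlier by process_text
cache.set(
    content_id,
    {"title": "Rekomendacja A", "content": "<p>Summary...</p>", "chat_history": []},
    timeout=3600,
)

with app.test_client() as client:
    response = client.get("/documentChat", query_string={"contentId": content_id})
    print(response.status_code)  # 200, with the rendered documentChat.html page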

handle_chat_message(data)

Handles incoming chat messages and generates a streamed response using cached document context.

This function is triggered via a Socket.IO event when the user sends a new chat message related to a previously processed PDF. It loads the relevant cached document data and chat history, configures the Gemini model, and streams the generated response back to the frontend in real time.

The function also updates the chat history in the cache after responding, enabling continued conversation with memory of previous exchanges.

Examples:

>>> handle_chat_message({
        "input": "What are the risks mentioned in the document?",
        "contentId": "content-pdf0_3",
        "output_size": "medium",
        "slider_value": 0.5,
        ...
    })

Parameters:

data (dict, required):
    A dictionary containing chat message data and UI parameters. Expected keys include:
      - "input": User's chat message (prompt) (str).
      - "contentId": The ID of the document container (str).
      - "output_size": Desired response length (str).
      - "choosen_model": Selected Gemini model (str).
      - "slider_value": Level of detail or verbosity (float or str).
      - "show_pages_checkbox": Whether to include page numbers (bool or str).
      - "change_length_checkbox": Whether the output size can be adjusted (bool or str).

Returns:

Type: None

None. Results are emitted via Socket.IO:
  - "receive_chat_message": Streams chat responses to the client.
  - "error": Emits errors if input is invalid or processing fails.
  - "stream_stopped": Indicates the end of streaming or failure.

Source code in src/main.py (lines 436-651)
@socketio.on("send_chat_message")
def handle_chat_message(data: dict) -> None:  # noqa: C901
    """Handles incoming chat messages and generates a streamed response using cached document context.

    This function is triggered via a Socket.IO event when the user sends a new
    chat message related to a previously processed PDF. It loads the relevant
    cached document data and chat history, configures the Gemini model,
    and streams the generated response back to the frontend in real time.

    The function also updates the chat history in the cache after responding,
    enabling continued conversation with memory of previous exchanges.

    Examples:
        >>> handle_chat_message({
                "input": "What are the risks mentioned in the document?",
                "contentId": "content-pdf0_3",
                "output_size": "medium",
                "slider_value": 0.5,
                ...
            })

    Args:
        data: A dictionary containing chat message data and UI parameters. Expected keys include:
            - "input": User's chat message (prompt) (str).
            - "contentId": The ID of the document container (str).
            - "output_size": Desired response length (str).
            - "choosen_model": Selected Gemini model (str).
            - "slider_value": Level of detail or verbosity (float or str).
            - "show_pages_checkbox": Whether to include page numbers (bool or str).
            - "change_length_checkbox": Whether the output size can be adjusted (bool or str).

    Returns:
        None. Results are emitted via Socket.IO:
            - "receive_chat_message": Streams chat responses to the client.
            - "error": Emits errors if input is invalid or processing fails.
            - "stream_stopped": Indicates the end of streaming or failure.

    Raises:
        Does not raise exceptions directly. All exceptions are caught, logged,
        and emitted as error messages to the client.
    """

    log.info("Received user input. Start processing.")

    try:
        global streaming
        streaming = True

        # get data
        prompt = data.get("input")
        content_id = data.get("contentId")

        # get cached data
        cached_data = cache.get(content_id)
        if not cached_data:
            log.error(f"Validation Error: No cached data found for UUID: {content_id}.")
            socketio.emit(
                "error",
                {
                    "message": f"Could not load data for chat session '{content_id}'. It may have expired."
                },
            )
            streaming = False
            socketio.emit("stream_stopped")
            return

        pdf_name = cached_data.get("title") if cached_data else None
        chat_history = cached_data.get("chat_history", [])
        rag_doc_slider = str(data.get("ragDocSlider"))
        print("-" * 10, "CHAT HISTORY", "-" * 10)
        print(chat_history)

        output_size = str(data.get("output_size"))
        show_pages_checkbox = str(data.get("show_pages_checkbox"))
        choosen_model = str(
            data.get("choosen_model", "gemini-2.0-flash")
        )  # second arg = default model
        change_length_checkbox = str(data.get("change_length_checkbox"))
        enhancer_checkbox = str(data.get("prompt_enhancer"))
        slider_value = data.get("slider_value")

        if slider_value is not None:
            slider_value = float(slider_value)
        else:
            slider_value = 0.0

        if not prompt:
            log.error("No prompt provided by user")
            socketio.emit("error", {"message": "No input provided"})
            streaming = False
            socketio.emit("stream_stopped")
            return

        if not content_id:
            log.error(
                f"Content ID missing or cached data not found for ID: {content_id}"
            )
            socketio.emit("error", {"message": "No content ID for the chat provided"})
            streaming = False
            socketio.emit("stream_stopped")
            return

        if not pdf_name:
            log.error(f"PDF name not found in cache for content ID: {content_id}")
            socketio.emit("error", {"message": "No pdf name provided"})
            streaming = False
            socketio.emit("stream_stopped")
            return

        if not isinstance(chat_history, list):
            log.warning(
                f"Cached data for '{content_id}' contained 'chat_history' but it was not a list "
                f"(type: {type(chat_history)}). Initializing as empty list."
            )
            chat_history = []

        # debug logs for each document
        log.debug(f"Prompt: {prompt}")
        log.debug(f"Content id: {content_id}")
        log.debug(f"Pdf name (from cache): {pdf_name}")
        log.debug(
            f"Initial Chat History (loaded/initialized): {len(chat_history)} messages"
        )
        log.debug(f"Output size: {output_size}")
        log.debug(f"Show pages: {show_pages_checkbox}")
        log.debug(f"Change output size: {change_length_checkbox}")
        log.debug(f"Selected model: {choosen_model}")
        log.debug(f"RAG or document: {rag_doc_slider}")
        log.debug(f"Prompt enhancer: {enhancer_checkbox}")

        # model instance inside the function to allow multiple models
        genai.configure(api_key=GEMINI_API_KEY)
        model = genai.GenerativeModel(
            choosen_model,
            system_instruction=show_pages(SYSTEM_PROMPT, show_pages_checkbox),
        )  # other models that can be used: "gemini-2.0-flash-thinking-exp-01-21" "gemini-2.0-flash"

        try:
            if not streaming:
                socketio.emit("stream_stopped")
                log.info("Stream stopped before file processing.")
            pdf_name_to_show = pdf_name

            collection_name = cached_data.get("collection_name")
            if not collection_name:
                # fallback for old cache entries
                collection_name = generate_vector_db_document_name(
                    pdf_name, max_length=CHROMADB_MAX_FILENAME_LENGTH
                )

            # print("-" * 10, "COLLECTION NAME HANDLING CHAT MESSAGE", "-" * 10)
            # print(collection_name)

            accumulated_text = ""
            for result_chunk in process_chat_query_with_rag(
                prompt,
                chat_history,
                pdf_name_to_show,
                model,
                change_length_checkbox,
                enhancer_checkbox,
                output_size,
                slider_value,
                chroma_client,
                collection_name,
                rag_doc_slider,
            ):
                if not streaming:
                    log.info("Stopping chat processing due to streaming flag.")
                    break
                # Check the structure of the yielded chunk
                if "content" in result_chunk:
                    log.debug(f'Received response chunk: {result_chunk["content"]}')
                    accumulated_text += result_chunk["content"]
                    chunk_text = result_chunk["content"]
                    socketio.emit("receive_chat_message", {"message": chunk_text})

                elif "error" in result_chunk:
                    error_message = result_chunk["error"]
                    log.error(
                        f"Error chunk from process_query_with_rag: {error_message}"
                    )
                    # --- Emit an error message to the frontend ---
                    socketio.emit("receive_chat_message", {"error": error_message})
                    # If an error occurs in a chunk, stop processing the rest of the stream
                    break
                else:
                    socketio.emit("error", {"message": "unexpected error"})
                    return
            if streaming:
                chat_history.append({"role": "user", "parts": [prompt]})
                chat_history.append({"role": "model", "parts": [accumulated_text]})
                cached_data["chat_history"] = (
                    chat_history  # <-- Assign the updated list back into the dictionary
                )
                cache.set(content_id, cached_data, timeout=3600)
                log.info(f"Stored updated data for {content_id} in cache.")

            if not streaming:
                socketio.emit("stream_stopped")
                log.info("Stream stopped during request processing.")

        except Exception as e:
            log.error(f"An error occurred in the generate function: {e}")
            traceback.print_exc()
            socketio.emit(
                "error", {"message": f"An unexpected error occurred: {str(e)}"}
            )

        streaming = False
        socketio.emit("stream_stopped")

    except Exception as e:
        log.error(f"An error occurred in the generate function: {e}")
        traceback.print_exc()
        socketio.emit("error", {"message": f"An unexpected error occurred: {str(e)}"})
        streaming = False
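
Example (a hedged sketch of the event payload, using the python-socketio client; the server address and contentId are assumptions, and the contentId must match a chat session previously stored in the cache):

import socketio  # python-socketio client

sio = socketio.Client()
sio.connect("http://localhost:5000")  # assumed address of the Flask-SocketIO server

@sio.on("receive_chat_message")
def on_chunk(data):
    # Each chunk carries either a "message" fragment or an "error" string.
    print(data.get("message") or data.get("error"), end="")

sio.emit("send_chat_message", {
    "input": "What are the risks mentioned in the document?",
    "contentId": "content-pdf0_3",        # must match a cached chat session
    "output_size": "medium",
    "choosen_model": "gemini-2.0-flash",
    "slider_value": 0.5,
    "show_pages_checkbox": "True",
    "change_length_checkbox": "False",
    "prompt_enhancer": "False",
    "ragDocSlider": "True",
})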

handle_clear_cache()

Clears all cached chat instances (UUIDs) created by the current client's session. Triggered by a button press on the main page.

Source code in src/main.py (lines 145-167)
@socketio.on("clear_cache")
def handle_clear_cache() -> None:
    """
    Clears all cached chat instances (UUIDs) created by the current client's session.
    Triggered by a button press on the main page.
    """
    global output_index
    sid = request.sid  # type: ignore[attr-defined]
    session_map_key = f"session_map_{sid}"
    session_content_ids = cache.get(session_map_key)

    if session_content_ids:
        log.info(f"Clear event for sid: {sid}. Clearing session's cached entries.")
        for container_id in session_content_ids:
            if cache.delete(container_id):
                log.info(f"Deleted cache for key: {container_id}")
        cache.delete(session_map_key)
        log.info(f"Deleted session map for sid: {sid}")
    else:
        log.info(f"Clear event for sid: {sid}. No session map found to clear.")

    output_index = -1
    log.info(f"Output index reset for sid: {sid}")

handle_disconnect()

Handles cache cleanup for all UUIDs created by a client's session.

Source code in src/main.py (lines 676-698)
@socketio.on("disconnect")
def handle_disconnect() -> None:
    """Handles cache cleanup for all UUIDs created by a client's session."""
    sid = request.sid  # type: ignore[attr-defined]
    session_map_key = f"session_map_{sid}"
    session_content_ids = cache.get(session_map_key)

    if session_content_ids:
        log.info(f"Disconnect event for sid: {sid}. Cleaning up cached entries.")
        for container_id in session_content_ids:
            if cache.delete(container_id):
                log.info(f"Deleted cache for key: {container_id}")
            else:
                log.warning(
                    f"Attempted to delete non-existent cache key: {container_id}"
                )

        cache.delete(session_map_key)
        log.info(f"Deleted session map for sid: {sid}")
    else:
        log.info(
            f"Disconnect event for sid: {sid}. No session map found, no cleanup needed."
        )

handle_reset_chat_history(data)

Finds a specific chat session by its UUID and resets its history, keeping only the first two messages (initial prompt and response).

Source code in src/main.py (lines 170-191)
@socketio.on("reset_chat_history")
def handle_reset_chat_history(data: dict) -> None:
    """
    Finds a specific chat session by its UUID and resets its history,
    keeping only the first two messages (initial prompt and response).
    """
    content_id = data.get("contentId")
    if not content_id:
        log.warning("Received reset_chat_history event without a contentId.")
        return

    log.info(f"Resetting chat history for UUID: {content_id}")

    cached_data = cache.get(content_id)

    if cached_data and "chat_history" in cached_data:
        cached_data["chat_history"] = cached_data["chat_history"][:2]
        cache.set(content_id, cached_data, timeout=3600)
        log.info(f"Successfully reset history for UUID: {content_id}")
        socketio.emit("history_reset_success", {"contentId": content_id})
    else:
        log.warning(f"Could not find data to reset for UUID: {content_id}")

handle_stop()

Stops the current processing stream when triggered by the client.

This function sets the global streaming flag to False, effectively stopping any ongoing data generation or response processing.

Examples:

None

Returns:

Type: None

None

Source code in src/main.py (lines 654-673)
@socketio.on("stop_processing")
def handle_stop() -> None:
    """Stops the current processing stream when triggered by the client.

    This function sets the global streaming flag to False, effectively stopping
    any ongoing data generation or response processing.

    Examples:
        None

    Returns:
        None

    Raises:
        None
    """

    global streaming
    streaming = False
    log.info("Processing Stopped by User")

index()

Serves the main page of the application with a list of available PDF files.

This route handler is mapped to the root URL ("/"). When accessed, it logs that the application is running, scans the designated directory for PDF files, and renders the homepage template with those files listed.

This enables users to see which documents are available for analysis.

Examples:

None

Returns:

Type: str

A rendered HTML page (index.html) with the following context:
  • pdf_files: A list of available PDF filenames in the scraped directory.
  • pdf_titles: A mapping of each PDF filename to its display title.
Source code in src/main.py (lines 116-142)
@app.route("/")
def index() -> str:
    """Serves the main page of the application with a list of available PDF files.

    This route handler is mapped to the root URL ("/"). When accessed, it logs
    that the application is running, scans the designated directory for PDF files,
    and renders the homepage template with those files listed.

    This enables users to see which documents are available for analysis.

    Examples:
        None

    Returns:
        A rendered HTML page (index.html) with the following context:
        - pdf_files: A list of available PDF filenames in the scraped directory.
        - pdf_titles: A mapping of each PDF filename to its display title.

    Raises:
        None directly
    """

    log.info("App is up")
    pdf_dir = Path(SCRAPED_FILES_DIR)
    pdf_files = [pdf.name for pdf in pdf_dir.glob("*.pdf")] if pdf_dir.exists() else []
    pdf_files = sorted(pdf_files, key=lambda x: extract_title_from_filename(x).lower())
    pdf_titles = {pdf: extract_title_from_filename(pdf) for pdf in pdf_files}
    return render_template("index.html", pdf_files=pdf_files, pdf_titles=pdf_titles)

process_text(data)

Handles the initial processing of user input and selected PDFs using a generative model.

This function is triggered via a Socket.IO event when a user initiates processing. It validates the input prompt and selected PDF files, sets up the selected Gemini model, and processes each PDF using retrieval-augmented generation (RAG). The response is streamed back to the client in real time, rendered in markdown, and cached for future access.

Each processed PDF results in the creation of a content container, which is dynamically sent to the frontend. If errors occur during processing, they are logged and sent to the client as error events.

Examples:

Triggered internally by Socket.IO when the user starts processing:

>>> process_text({
        "input": "Summarize the risks mentioned",
        "pdfFiles": ["KNF_2022_01.pdf"],
        "output_size": "short",
        "show_pages_checkbox": True,
        "choosen_model": "gemini-2.0-flash",
        ...
    })

Parameters:

data (dict, required):
    A dictionary containing user input and options. Expected keys include:
      - "input": User’s prompt (str).
      - "pdfFiles": List of PDF filenames to process (List[str]).
      - "output_size": Approximate length of the response (str).
      - "show_pages_checkbox": Whether to include page numbers (bool or str).
      - "choosen_model": Selected Gemini model (str).
      - "change_length_checkbox": Whether output length can vary (bool or str).
      - "slider_value": Float controlling verbosity or detail (str or float).
      - "ragDocSlider": Toggle between RAG and document mode (str).
      - Other UI flags or settings.

Returns:

Type: None

None. Results are streamed to the client via Socket.IO events:
  - "new_container": Sends a new HTML container for each PDF.
  - "update_content": Streams chunks of the model's response.
  - "processing_complete_for_container": Signals PDF completion.
  - "error": Sends error messages if validation or processing fails.
  - "stream_stopped": Indicates the end of the streaming session.

Source code in src/main.py (lines 194-432)
@socketio.on("start_processing")
def process_text(data: dict) -> None:  # noqa: C901
    """Handles the initial processing of user input and selected PDFs using a generative model.

    This function is triggered via a Socket.IO event when a user initiates processing.
    It validates the input prompt and selected PDF files, sets up the selected
    Gemini model, and processes each PDF using retrieval-augmented generation (RAG).
    The response is streamed back to the client in real time, rendered in markdown,
    and cached for future access.

    Each processed PDF results in the creation of a content container, which is
    dynamically sent to the frontend. If errors occur during processing, they are
    logged and sent to the client as error events.

    Examples:
        # Triggered internally by Socket.IO when the user starts processing:
        >>> process_text({
                "input": "Summarize the risks mentioned",
                "pdfFiles": ["KNF_2022_01.pdf"],
                "output_size": "short",
                "show_pages_checkbox": True,
                "choosen_model": "gemini-2.0-flash",
                ...
            })

    Args:
        data: A dictionary containing user input and options. Expected keys include:
            - "input": User’s prompt (str).
            - "pdfFiles": List of PDF filenames to process (List[str]).
            - "output_size": Approximate length of the response (str).
            - "show_pages_checkbox": Whether to include page numbers (bool or str).
            - "choosen_model": Selected Gemini model (str).
            - "change_length_checkbox": Whether output length can vary (bool or str).
            - "slider_value": Float controlling verbosity or detail (str or float).
            - "ragDocSlider": Toggle between RAG and document mode (str).
            - Other UI flags or settings.

    Returns:
        None. Results are streamed to the client via Socket.IO events:
            - "new_container": Sends a new HTML container for each PDF.
            - "update_content": Streams chunks of the model's response.
            - "processing_complete_for_container": Signals PDF completion.
            - "error": Sends error messages if validation or processing fails.
            - "stream_stopped": Indicates the end of the streaming session.

    Raises:
        Emits error events instead of raising exceptions directly.
        Internal exceptions are caught, logged, and passed to the client as messages.
    """

    log.info("Started input processing")

    try:
        global output_index
        output_index += 1
        global streaming
        streaming = True
        sid = request.sid  # type: ignore[attr-defined]

        log.info(f"SID start_processing main page: {request.sid}")  # type: ignore[attr-defined]
        # get data
        prompt = data.get("input")
        selected_files = data.get("pdfFiles")
        output_size = data.get("output_size")
        show_pages_checkbox = str(data.get("show_pages_checkbox"))
        choosen_model = str(
            data.get("choosen_model", "gemini-2.0-flash")
        )  # second arg = default model
        change_length_checkbox = str(data.get("change_length_checkbox"))
        enhancer_checkbox = str(data.get("prompt_enhancer"))
        slider_value = data.get("slider_value")
        rag_doc_slider = str(data.get("ragDocSlider"))

        if slider_value is not None:
            slider_value = float(slider_value)
        else:
            slider_value = 0.0

        if not prompt:
            log.error("No prompt provided")
            socketio.emit("error", {"message": "No input provided"})
            streaming = False
            socketio.emit("stream_stopped")
            return

        if not selected_files or selected_files == []:
            log.error("no selected files")
            socketio.emit("error", {"message": "No files selected"})
            streaming = False
            socketio.emit("stream_stopped")
            return

        output_size = str(output_size)

        # debug logs for each document
        log.debug(f"prompt: {prompt}")
        log.debug(f"selected files: {selected_files}")
        log.debug(f"output size: {output_size}")
        log.debug(f"Show pages: {show_pages_checkbox}")
        log.debug(f"Change output size: {change_length_checkbox}")
        log.debug(f"selected_model: {choosen_model}")
        log.debug(f"Prompt enhancer: {enhancer_checkbox}")
        log.debug(f"RAG or document: {rag_doc_slider}")

        # files
        pdf_dir = Path(SCRAPED_FILES_DIR)

        pdfs_to_scan = [pdf_dir / file_name for file_name in selected_files]

        for pdf in pdfs_to_scan:
            log.info(pdf)

        # model instance inside the function to allow multiple models
        genai.configure(api_key=GEMINI_API_KEY)
        model = genai.GenerativeModel(
            choosen_model,
            system_instruction=show_pages(SYSTEM_PROMPT, show_pages_checkbox),
        )  # other models that can be used: "gemini-2.0-flash-thinking-exp-01-21" "gemini-2.0-flash"

        try:
            for index, pdf in enumerate(pdfs_to_scan):
                if not streaming:
                    break
                # document title extraction
                pdf_parts = pdf.stem.split("_", 2)
                if len(pdf_parts) == 3:
                    doc_id, timestamp, title = pdf_parts
                    pdf_name_to_show = title.lstrip("_").rstrip("_")
                else:
                    pdf_name_to_show = pdf.stem.lstrip("_").rstrip("_")  # fallback

                container_id = str(uuid.uuid4())
                log.info(f"Generated unique container ID (UUID): {container_id}")

                container_html = render_template(
                    "output.html",
                    container_title=pdf_name_to_show,
                    container_id=container_id,
                )

                log.info(f"New container created for: {pdf_name_to_show}")
                socketio.emit("new_container", {"html": container_html})

                collection_name = generate_vector_db_document_name(
                    pdf.stem, max_length=CHROMADB_MAX_FILENAME_LENGTH
                )
                print("-" * 10, "COLLECTION NAME", "-" * 10)
                print(collection_name)

                accumulated_text = ""

                for result_chunk in process_query_with_rag(
                    prompt,
                    pdf_name_to_show,
                    model,
                    change_length_checkbox,
                    enhancer_checkbox,
                    output_size,
                    slider_value,
                    chroma_client,
                    collection_name,
                    rag_doc_slider,
                ):
                    if not streaming:
                        break
                    if "error" in result_chunk:
                        log.error("Error received in chunk")
                        error_message = {"message": "error in chunk response"}
                        socketio.emit("error", error_message)
                        return
                    elif "content" in result_chunk:
                        log.debug(f'Received response chunk: {result_chunk["content"]}')
                        accumulated_text += result_chunk["content"]
                        markdown_content = markdown.markdown(accumulated_text)
                        final_markdown_content = (
                            markdown_content  # Keep track of the latest full content
                        )
                        socketio.emit(
                            "update_content",
                            {
                                "container_id": container_id,
                                "html": markdown_content,
                            },
                        )
                    else:
                        socketio.emit("error", {"message": "unexpected error"})
                        return
                if streaming:
                    chat_history = [
                        {"role": "user", "parts": [prompt]},
                        {"role": "model", "parts": [accumulated_text]},
                    ]
                    data_to_cache = {
                        "title": pdf_name_to_show,
                        "content": final_markdown_content,
                        "chat_history": chat_history,
                        "collection_name": collection_name,
                    }
                    # Set a timeout (e.g., 1 hour = 3600 seconds)
                    cache.set(container_id, data_to_cache, timeout=3600)
                    log.info(f"Stored content for unique key {container_id} in cache.")
                    log.info(
                        f"Initial processing complete for container ID: {container_id}."
                        "Emitting completion signal."
                    )
                    session_map_key = f"session_map_{sid}"
                    session_content_ids = cache.get(session_map_key) or []
                    if container_id not in session_content_ids:
                        session_content_ids.append(container_id)
                        cache.set(session_map_key, session_content_ids, timeout=3600)
                        log.info(f"Added {container_id} to session map for sid: {sid}")
                    # Emit a custom event indicating completion for THIS container
                    socketio.emit(
                        "processing_complete_for_container",
                        {"container_id": container_id},
                    )
                else:
                    log.info(
                        f"Processing stopped for {pdf_name_to_show} ({container_id}). Not emitting completion signal."
                    )

            if not streaming:
                socketio.emit("stream_stopped")
                log.info("Stream stopped during file processing.")
        except Exception as e:
            log.error(f"An error occurred in the generate function: {e}")
            traceback.print_exc()
            socketio.emit(
                "error", {"message": f"An unexpected error occurred: {str(e)}"}
            )

        streaming = False
        socketio.emit("stream_stopped")

    except Exception as e:
        log.error(f"An error occurred in the generate function: {e}")
        traceback.print_exc()
        socketio.emit("error", {"message": f"An unexpected error occurred: {str(e)}"})
        streaming = False
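
Example (a hedged sketch using Flask-SocketIO's test client; the import path, filename, and option values are assumptions, and a real run requires a valid GEMINI_API_KEY and an indexed document):

from main import app, socketio  # import path depends on how the project is installed

client = socketio.test_client(app)
client.emit("start_processing", {
    "input": "Summarize the risks mentioned",
    "pdfFiles": ["1_20220101_Rekomendacja_A.pdf"],  # hypothetical scraped file
    "output_size": "short",
    "show_pages_checkbox": "True",
    "choosen_model": "gemini-2.0-flash",
    "change_length_checkbox": "False",
    "prompt_enhancer": "False",
    "slider_value": 0.5,
    "ragDocSlider": "True",
})
for event in client.get_received():
    print(event["name"])  # new_container, update_content, ..., stream_stopped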

extract_text_from_pdf(pdf_path)

Extracts text from a PDF file.

This function reads a PDF file, extracts text from each page, and collects the extracted text into a list of per-page strings. If an error occurs during extraction, it logs the error and returns an empty string.

Examples:

>>> extract_text_from_pdf(Path("document.pdf"))
['This is the extracted text from the PDF document.']

Parameters:

pdf_path (Path, required):
    A Path object representing the PDF file to extract text from.

Returns:

A list of strings, one per page, containing the extracted text from the PDF.
If an error occurs, an empty string is returned.

Raises:

Exception: Any exceptions encountered while reading or extracting text
from the PDF are logged and handled.
Source code in src/backend/extract_text.py (lines 11-46)
def extract_text_from_pdf(pdf_path: Path):  # type: ignore
    """Extracts text from a PDF file.

    This function reads a PDF file, extracts text from each page, and collects
    the extracted text into a list of per-page strings. If an error occurs during
    extraction, it logs the error and returns an empty string.

    Examples:
        >>> extract_text_from_pdf(Path("document.pdf"))
        ['This is the extracted text from the PDF document.']

    Args:
        pdf_path: A Path object representing the PDF file to extract text from.

    Returns:
        A list of strings, one per page, containing the extracted text from the PDF.
        If an error occurs, an empty string is returned.

    Raises:
        Exception: Any exceptions encountered while reading or extracting text
        from the PDF are logged and handled.
    """

    try:
        pages_list = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:  # some pages may return None
                    pages_list.append(page_text)
        return pages_list

    except Exception as e:
        log.error(f"Error processing {pdf_path}: {str(e)}")
        traceback.print_exc()
        return ""

scrape_knf(scraped_dir, num_retries, user_agent_list)

Scrapes pdf files from KNF url.

This function scrapes PDF files from a KNF URL. For a given number of retries it masquerades as a user agent from the provided agent list and downloads each file into a directory, adding a document ID to the file name. If an error occurs during scraping, it logs the error message.

Examples:

>>> scrape_knf(Path("data/scraped"), 10, ["Mozilla/5.0", "Mozilla/4.0"])
None

Parameters:

scraped_dir (Path, required):
    The directory where downloaded PDF files are saved.
num_retries (int, required):
    An int describing the number of retries the program will attempt when scraping a file.
user_agent_list (list, required):
    A list of strings with user agents for masking.

Returns:

Type: None

None

Raises:

Exception: If an error occurs while downloading or parsing a file, it is logged and the file is skipped.
Source code in src/backend/knf_scraping.py (lines 43-131)
def scrape_knf(scraped_dir: Path, num_retries: int, user_agent_list: list) -> None:
    """Scrapes pdf files from KNF url.

    This function scrapes PDF files from a KNF URL.
    For a given number of retries it masquerades as a user agent from the provided
    agent list and downloads each file into a directory, adding a document ID to
    the file name. If an error occurs during scraping, it logs the error message.

    Examples:
        >>> scrape_knf(Path("data/scraped"), 10, ["Mozilla/5.0", "Mozilla/4.0"])
        None

    Args:
        scraped_dir: The directory where downloaded PDF files are saved.
        num_retries: An int describing the number of retries the program will
            attempt when scraping a file.
        user_agent_list: A list of strings with user agents for masking.

    Returns:
        None

    Raises:
        Exception: If an error occurs while downloading or parsing a file, it is
            logged and the file is skipped.
    """

    knf_base_url = "https://www.knf.gov.pl"
    knf_recommendations_url = (
        f"{knf_base_url}/dla_rynku/regulacje_i_praktyka/rekomendacje_i"
        + "_wytyczne/rekomendacje_dla_bankow?articleId=8522&p_id=18"
    )

    scraped_dir.mkdir(parents=True, exist_ok=True)

    response = None
    for _ in range(num_retries):
        try:
            headers = {
                "User-Agent": user_agent_list[
                    random.randint(0, len(user_agent_list) - 1)
                ]
            }
            response = requests.get(knf_recommendations_url, headers=headers)
            if response.status_code in [200, 404]:
                break  # escape loop if response was successful
        except requests.exceptions.ConnectionError:
            log.error("Connection failed, retrying...")

    if response and response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")

        time_html_tag = soup.time
        datetime_atr = time_html_tag["datetime"]  # type: ignore[index]

        pdf_titles_links = {}
        for link in soup.find_all(
            "a", title=lambda x: x and "rekomendacja" in x.lower()
        ):
            try:
                href = link.get("href")
                title = link.get_text(strip=True)
                if href and href.endswith(".pdf"):
                    # not all .pdf are listed on knf.gov.pl
                    if "https" not in href:
                        full_url = knf_base_url + href
                        pdf_titles_links[title] = full_url
                    else:
                        pdf_titles_links[title] = href
            except Exception as e:
                log.error(f"Problem with link for {link} \n Error messange: {e}\n")

        for title, url in pdf_titles_links.items():
            if len(title) > 1:  # temporary: scraping needs deeper rework;
                try:
                    pdf_response = requests.get(url, headers=headers)
                    safe_title = windows_safe_filename(title) if title else "unknown"
                    entry = get_or_assign_id(safe_title)
                    # adding datetime from KNF site to file name
                    assert isinstance(datetime_atr, str)
                    date_str = datetime_atr[:10].replace("-", "")
                    unique_filename = f"{entry['id']}_{date_str}_{safe_title}.pdf"
                    pdf_path = scraped_dir / unique_filename
                    with open(pdf_path, "wb") as f:
                        f.write(pdf_response.content)
                        log.debug(f"Downloaded: {pdf_path}")
                except Exception as e:
                    log.error(f"PDF not downloaded: {url} \n Error messange: {e}\n")
                    traceback.print_exc()
    else:
        log.error("Failed to retrieve the main page content after retries.")

windows_safe_filename(filename)

Removes invalid characters from the file name.

This function removes any invalid character in Windows file names from the filename.
It also replaces end-of-line characters "\n" with spaces " ". Returns the new file name.

Examples:
    >>> windows_safe_filename('invalid:filename?.txt')
    'invalidfilename.txt'

Args:
    filename: A string containing the file name.

Returns:
    A string containing file name cleansed from any invalid characters.

Raises:
    None
Source code in src/backend/knf_scraping.py (lines 16-40)
def windows_safe_filename(filename: str) -> str:
    """Removes invalid characters from the file name.

    This function removes any invalid character in Windows file name from the filename.
    It also replaces end of line characters "\n" with spaces " ".
    Returns the new file name.

    Examples:
        >>> windows_safe_filename('invalid:filename?.txt')
        'invalidfilename.txt'

    Args:
        filename: A string containing the file name.

    Returns:
        A string containing file name cleansed from any invalid characters.

    Raises:
        None
    """
    filename = filename.replace("\n", " ")
    filename = re.sub(
        r'[<>:"/\\|?*]', "", filename
    )  # <>:"/\|?* are invalid characters in Windows file names
    return filename

process_chat_query_with_rag(prompt, chat_history, pdf_name, model, change_length_checkbox, enhancer_checkbox, output_size, temperature_slider_value, chroma_client, collection_name, rag_doc_slider)

Processes a chat query using RAG, incorporating conversation history.

Retrieves relevant context from a document collection based on the current user prompt. It then combines the (potentially enhanced) prompt with this context and the existing chat history, queries the generative model, and streams the response.

Parameters:

prompt (str, required):
    The user's current query/message in the conversation.
chat_history (str, required):
    A string representation of the conversation history.
    (Note: Assumes model.start_chat() accepts this string format).
pdf_name (str, required):
    The identifier/name of the document (for RAG context and logging).
model (GenerativeModel, required):
    The generative AI model instance (e.g., genai.GenerativeModel).
change_length_checkbox (str, required):
    String flag ("True"/"False") to modify response length.
enhancer_checkbox (str, required):
    String flag ("True"/"False") for prompt enhancement.
output_size (str, required):
    The desired output size (e.g., number of words).
temperature_slider_value (float, required):
    Temperature for model generation.
chroma_client (Client, required):
    The ChromaDB client instance.
collection_name (str, required):
    Name of the ChromaDB collection for this document.
rag_doc_slider (str, required):
    String flag ("True" to use all chunks from the document's collection, "False" for a default number).

Yields:

dict:
    A dictionary for each chunk of the response or for an error.
    For content: {"pdf_name": str, "content": str}.
    For error: {"error": str}.

Source code in src/backend/process_query.py (lines 343-434)
def process_chat_query_with_rag(
    prompt: str,
    chat_history: str,
    pdf_name: str,
    model: genai.GenerativeModel,
    change_length_checkbox: str,
    enhancer_checkbox: str,
    output_size: str,
    temperature_slider_value: float,
    chroma_client: ChromaClient,
    collection_name: str,
    rag_doc_slider: str,
) -> Any:
    """
    Processes a chat query using RAG, incorporating conversation history.

    Retrieves relevant context from a document collection based on the current
    user prompt. It then combines the (potentially enhanced) prompt with this
    context and the existing chat history, queries the generative model,
    and streams the response.

    Args:
        prompt: The user's current query/message in the conversation.
        chat_history: A string representation of the conversation history.
                      (Note: Assumes model.start_chat() accepts this string format).
        pdf_name: The identifier/name of the document (for RAG context and logging).
        model: The generative AI model instance (e.g., genai.GenerativeModel).
        change_length_checkbox: String flag ("True"/"False") to modify response length.
        enhancer_checkbox: String flag ("True"/"False") for prompt enhancement.
        output_size: The desired output size (e.g., number of words).
        temperature_slider_value: Temperature for model generation.
        chroma_client: The ChromaDB client instance.
        collection_name: Name of the ChromaDB collection for this document.
        rag_doc_slider: String flag ("True" to use all chunks from the document's
                        collection, "False" for a default number).

    Yields:
        dict: A dictionary for each chunk of the response or for an error.
              For content: `{"pdf_name": str, "content": str}`.
              For error: `{"error": str}`.
    """

    if not prompt:
        yield {"error": "No prompt provided"}
        return

    rag_context = _get_rag_context(
        prompt=prompt,
        pdf_name=pdf_name,
        chroma_client=chroma_client,
        collection_name=collection_name,
        rag_doc_slider=rag_doc_slider,
        embedding_function=get_gemini_ef(),
    )
    log.debug(f"Context for {pdf_name} (chat query):\n{rag_context}\n")

    final_llm_prompt = _build_final_llm_prompt(
        base_prompt=prompt,
        change_length_flag=change_length_checkbox,
        output_size=output_size,
        enhancer_flag=enhancer_checkbox,
        model=model,
        identifier=pdf_name,
        rag_context=rag_context,
        chat_history=chat_history,
    )

    try:
        log.info(
            f"Generating chat response for query on '{pdf_name}' with prompt: "
            f"'{final_llm_prompt[:200]}...'"
        )
        chat = model.start_chat(history=chat_history)  # type: ignore[arg-type]
        response = chat.send_message(
            [final_llm_prompt],
            stream=True,
            generation_config=genai.types.GenerationConfig(
                temperature=temperature_slider_value
            ),
        )

        for response_chunk in response:
            # replace -> sometimes a double space between words occurs; most likely reason: pdf formatting
            response_chunk_text = response_chunk.text.replace("  ", " ")
            yield {"pdf_name": pdf_name, "content": response_chunk_text}
            time.sleep(STREAM_RESPONSE_CHUNK_DELAY_SECONDS)
        log.debug(f"Response for: {pdf_name} was saved!\n")
        time.sleep(POST_PROCESS_DELAY_SECONDS)  # lower API request rate per sec
    except Exception as e:
        log.error(f"There is a problem with {pdf_name}. \n Error message: {e}\n")
        traceback.print_exc()
        yield {"error": f"An error occurred while processing {pdf_name}: {str(e)}"}

process_pdf(prompt, pdf, model, change_length_checkbox, enhancer_checkbox, output_size, temperature_slider_value)

Uploads a PDF, processes it with a generative model, and streams content.

This function takes a PDF file and a prompt, uploads the file, and then calls the generative model to process the content based on the (potentially enhanced) prompt. It streams the model's response, yielding cleaned text chunks or an error dictionary.

Parameters:

prompt (str, required):
    The base prompt for processing the document.
pdf (Path, required):
    A Path object representing the PDF file to be processed.
model (GenerativeModel, required):
    The generative AI model instance (e.g., genai.GenerativeModel).
change_length_checkbox (str, required):
    String flag ("True"/"False") to indicate if output size instruction should be added.
enhancer_checkbox (str, required):
    String flag ("True"/"False") to indicate if the prompt should be enhanced.
output_size (str, required):
    The desired output size (e.g., number of words).
temperature_slider_value (float, required):
    The temperature setting for model generation.

Yields:

dict:
    A dictionary for each chunk of the response or for an error.
    For content: {"pdf_name": str, "content": str}.
    For error: {"error": str}.

Source code in src/backend/process_query.py (lines 191-258)
def process_pdf(
    prompt: str,
    pdf: Path,
    model: genai.GenerativeModel,
    change_length_checkbox: str,
    enhancer_checkbox: str,
    output_size: str,
    temperature_slider_value: float,
) -> Any:
    """
    Uploads a PDF, processes it with a generative model, and streams content.

    This function takes a PDF file and a prompt, uploads the file,
    and then calls the generative model to process the content based on
    the (potentially enhanced) prompt. It streams the model's response,
    yielding cleaned text chunks or an error dictionary.

    Args:
        prompt: The base prompt for processing the document.
        pdf: A Path object representing the PDF file to be processed.
        model: The generative AI model instance (e.g., genai.GenerativeModel).
        change_length_checkbox: String flag ("True"/"False") to indicate if
                                output size instruction should be added.
        enhancer_checkbox: String flag ("True"/"False") to indicate if the
                           prompt should be enhanced.
        output_size: The desired output size (e.g., number of words).
        temperature_slider_value: The temperature setting for model generation.

    Yields:
        dict: A dictionary for each chunk of the response or for an error.
              For content: `{"pdf_name": str, "content": str}`
              For error: `{"error": str}`.
    """

    if not prompt:
        yield {"error": "No prompt provided"}
        return

    try:
        log.info(f"Document: {pdf.stem} is beeing analyzed.")
        file_to_send = genai.upload_file(pdf)
        log.debug(f"PDF uploaded successfully. File metadata: {file_to_send}\n")

        final_llm_prompt_for_model = _build_final_llm_prompt(
            base_prompt=prompt,
            change_length_flag=change_length_checkbox,
            output_size=output_size,
            enhancer_flag=enhancer_checkbox,
            model=model,
            identifier=pdf.stem,
        )
        response = model.generate_content(
            [final_llm_prompt_for_model, file_to_send],
            stream=True,
            generation_config={"temperature": temperature_slider_value},
        )

        for response_chunk in response:
            # replace -> sometimes a double space between words occurs; most likely reason: pdf formatting
            response_chunk_text = response_chunk.text.replace("  ", " ")
            yield {"pdf_name": pdf, "content": response_chunk_text}
            time.sleep(STREAM_RESPONSE_CHUNK_DELAY_SECONDS)
        log.debug(f"Response for: {pdf} was saved!\n")
        time.sleep(POST_PROCESS_DELAY_SECONDS)  # lower API request rate per sec
    except Exception as e:
        log.error(f"There is a problem with {pdf.stem}. \n Error message: {e}\n")
        traceback.print_exc()
        yield {"error": f"An error occurred while processing {pdf.stem}: {str(e)}"}

process_query_with_rag(prompt, pdf_name, model, change_length_checkbox, enhancer_checkbox, output_size, temperature_slider_value, chroma_client, collection_name, rag_doc_slider)

Processes a query using RAG, combining it with context from a document.

This function retrieves relevant context from a specified document collection (via ChromaDB) based on the user's prompt. It then combines the (potentially enhanced) prompt with this context and queries the generative model, streaming the response.

Parameters:

prompt (str, required):
    The user's base query/prompt.
pdf_name (str, required):
    The identifier/name of the document (for RAG context and logging).
model (GenerativeModel, required):
    The generative AI model instance (e.g., genai.GenerativeModel).
change_length_checkbox (str, required):
    String flag ("True"/"False") to modify response length.
enhancer_checkbox (str, required):
    String flag ("True"/"False") for prompt enhancement.
output_size (str, required):
    The desired output size (e.g., number of words).
temperature_slider_value (float, required):
    Temperature for model generation.
chroma_client (Client, required):
    The ChromaDB client instance.
collection_name (str, required):
    Name of the ChromaDB collection for this document.
rag_doc_slider (str, required):
    String flag ("True" to use all chunks from the document's collection, "False" for a default number).

Yields:

dict:
    A dictionary for each chunk of the response or for an error.
    For content: {"pdf_name": str, "content": str}.
    For error: {"error": str}.

Source code in src/backend/process_query.py (lines 261-340)
def process_query_with_rag(
    prompt: str,
    pdf_name: str,
    model: genai.GenerativeModel,
    change_length_checkbox: str,
    enhancer_checkbox: str,
    output_size: str,
    temperature_slider_value: float,
    chroma_client: ChromaClient,
    collection_name: str,
    rag_doc_slider: str,
) -> Any:
    """
    Processes a query using RAG, combining it with context from a document.

    This function retrieves relevant context from a specified document collection
    (via ChromaDB) based on the user's prompt. It then combines the
    (potentially enhanced) prompt with this context and queries the generative
    model, streaming the response.

    Args:
        prompt: The user's base query/prompt.
        pdf_name: The identifier/name of the document (for RAG context and logging).
        model: The generative AI model instance (e.g., genai.GenerativeModel).
        change_length_checkbox: String flag ("True"/"False") to modify response length.
        enhancer_checkbox: String flag ("True"/"False") for prompt enhancement.
        output_size: The desired output size (e.g., number of words).
        temperature_slider_value: Temperature for model generation.
        chroma_client: The ChromaDB client instance.
        collection_name: Name of the ChromaDB collection for this document.
        rag_doc_slider: String flag ("True" to use all chunks from the document's
                        collection, "False" for a default number).

    Yields:
        dict: A dictionary for each chunk of the response or for an error.
              For content: `{"pdf_name": str, "content": str}`.
              For error: `{"error": str}`.
    """

    if not prompt:
        yield {"error": "No prompt provided"}
        return

    rag_context = _get_rag_context(
        prompt=prompt,
        pdf_name=pdf_name,
        chroma_client=chroma_client,
        collection_name=collection_name,
        rag_doc_slider=rag_doc_slider,
        embedding_function=get_gemini_ef(),
    )
    log.debug(f"Context for {pdf_name}:\n{rag_context}\n")

    final_llm_prompt = _build_final_llm_prompt(
        base_prompt=prompt,
        change_length_flag=change_length_checkbox,
        output_size=output_size,
        enhancer_flag=enhancer_checkbox,
        model=model,
        identifier=pdf_name,
        rag_context=rag_context,
    )

    try:
        response = model.generate_content(
            [final_llm_prompt],
            stream=True,
            generation_config={"temperature": temperature_slider_value},
        )
        for response_chunk in response:
            # replace -> sometimes a double space between words occurs; most likely reason: pdf formatting
            response_chunk_text = response_chunk.text.replace("  ", " ")
            yield {"pdf_name": pdf_name, "content": response_chunk_text}
            time.sleep(STREAM_RESPONSE_CHUNK_DELAY_SECONDS)
        log.debug(f"Response for: {pdf_name} was saved!\n")
        time.sleep(POST_PROCESS_DELAY_SECONDS)  # lower API request rate per sec
    except Exception as e:
        log.error(f"There is a problem with {pdf_name}. \n Error message: {e}\n")
        traceback.print_exc()
        yield {"error": f"An error occurred while processing {pdf_name}: {str(e)}"}

show_pages(system_prompt, show_pages_checkbox)

Appends an additional string to the system prompt to instruct the model to include page numbers in its outputs.

Source code in src/backend/show_pages.py (lines 13-22)
def show_pages(system_prompt: str, show_pages_checkbox: str) -> str:
    """
    Appends an additional string to the system prompt to instruct the model to include page numbers in its outputs.
    """
    if show_pages_checkbox == "True":  # request can be used only inside the function
        log.debug(system_prompt + OPTIONAL_PAGE_NUMBER_SP)
        return system_prompt + OPTIONAL_PAGE_NUMBER_SP
    else:
        log.debug(system_prompt)
        return system_prompt
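
Example (a tiny sketch; the base prompt is a placeholder and the import path is assumed):

from backend.show_pages import show_pages  # assumed import path

base_prompt = "You are a helpful assistant for KNF documents."
with_pages = show_pages(base_prompt, "True")      # appends the page-number instruction
without_pages = show_pages(base_prompt, "False")  # returns the prompt unchanged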