Reference

document_chat()

Serves the document chat page using cached content based on the provided content ID.

Retrieves cached data (title and content) for a given contentId passed as a query parameter and renders a chat interface for continued conversation with the document.

Examples:

None

Returns:

Type: Any

Rendered HTML page (documentChat.html) with:
  • content_id: The ID of the requested content.
  • container_title_chat: The title of the document.
  • content_chat: The previously generated content or an error message.
Source code in src/main.py (lines 701-752)
@app.route("/documentChat")
def document_chat() -> Any:
    """Serves the document chat page using cached content based on the provided content ID.

    Retrieves cached data (title and content) for a given contentId passed as a query parameter
    and renders a chat interface for continued conversation with the document.

    Examples:
        None

    Returns:
        Rendered HTML page (documentChat.html) with:
        - content_id: The ID of the requested content.
        - container_title_chat: The title of the document.
        - content_chat: The previously generated content or an error message.

    Raises:
        None
    """

    content_id = request.args.get("contentId")  # Get ID from URL query ?contentId=...
    log.info(f"Langchain chat request for contentId: {content_id}")

    # Retrieve data from cache
    cached_data = cache.get(content_id)
    log.debug(f"Cache lookup for {content_id} returned: {type(cached_data)}")

    if cached_data:
        container_title_chat = cached_data.get("title", "Unknown Title")
        content_chat = cached_data.get("content", "<p>Content not found.</p>")
        chat_history = cached_data.get("chat_history", [])
        log.info(f"Found content for {content_id} in cache.")
        log.info(f"Found {len(chat_history)} messages in history for {content_id}.")

        for message in chat_history:
            if message.get("role") == "model" and message.get("parts"):
                # Convert the raw markdown in 'parts' to HTML
                raw_markdown = message["parts"][0]
                message["parts"][0] = markdown.markdown(raw_markdown)
    else:
        container_title_chat = "Error"
        content_chat = f"<p>Could not find content for ID: {content_id}. Cache might be empty or ID is invalid.</p>"
        chat_history = []
        log.warning(f"Content for {content_id} not found in cache.")

    return render_template(
        "documentChat.html",
        content_id=content_id,
        container_title_chat=container_title_chat,
        content_chat=content_chat,
        chat_history=chat_history,
    )
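
Example (a minimal sketch, assuming the Flask app and cache can be imported from src/main.py and that a cache entry exists for the given ID; the ID and cached values below are illustrative):

from main import app, cache  # import path depends on how the project is installed

content_id = "example-content-id"  # hypothetical ID stored earlier by process_text
cache.set(
    content_id,
    {"title": "Rekomendacja A", "content": "<p>Summary...</p>", "chat_history": []},
    timeout=3600,
)

with app.test_client() as client:
    response = client.get("/documentChat", query_string={"contentId": content_id})
    print(response.status_code)  # 200, with the rendered documentChat.html page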

handle_chat_message(data)

Handles incoming chat messages and generates a streamed response using cached document context.

This function is triggered via a Socket.IO event when the user sends a new chat message related to a previously processed PDF. It loads the relevant cached document data and chat history, configures the Gemini model, and streams the generated response back to the frontend in real time.

The function also updates the chat history in the cache after responding, enabling continued conversation with memory of previous exchanges.

Examples:

>>> handle_chat_message({
        "input": "What are the risks mentioned in the document?",
        "contentId": "content-pdf0_3",
        "output_size": "medium",
        "slider_value": 0.5,
        ...
    })

Parameters:

data (dict, required):
    A dictionary containing chat message data and UI parameters. Expected keys include:
      - "input": User's chat message (prompt) (str).
      - "contentId": The ID of the document container (str).
      - "output_size": Desired response length (str).
      - "choosen_model": Selected Gemini model (str).
      - "slider_value": Level of detail or verbosity (float or str).
      - "show_pages_checkbox": Whether to include page numbers (bool or str).
      - "change_length_checkbox": Whether the output size can be adjusted (bool or str).

Returns:

Type: None

None. Results are emitted via Socket.IO:
  - "receive_chat_message": Streams chat responses to the client.
  - "error": Emits errors if input is invalid or processing fails.
  - "stream_stopped": Indicates the end of streaming or failure.

Source code in src/main.py (lines 436-651)
@socketio.on("send_chat_message")
def handle_chat_message(data: dict) -> None:  # noqa: C901
    """Handles incoming chat messages and generates a streamed response using cached document context.

    This function is triggered via a Socket.IO event when the user sends a new
    chat message related to a previously processed PDF. It loads the relevant
    cached document data and chat history, configures the Gemini model,
    and streams the generated response back to the frontend in real time.

    The function also updates the chat history in the cache after responding,
    enabling continued conversation with memory of previous exchanges.

    Examples:
        >>> handle_chat_message({
                "input": "What are the risks mentioned in the document?",
                "contentId": "content-pdf0_3",
                "output_size": "medium",
                "slider_value": 0.5,
                ...
            })

    Args:
        data: A dictionary containing chat message data and UI parameters. Expected keys include:
            - "input": User's chat message (prompt) (str).
            - "contentId": The ID of the document container (str).
            - "output_size": Desired response length (str).
            - "choosen_model": Selected Gemini model (str).
            - "slider_value": Level of detail or verbosity (float or str).
            - "show_pages_checkbox": Whether to include page numbers (bool or str).
            - "change_length_checkbox": Whether the output size can be adjusted (bool or str).

    Returns:
        None. Results are emitted via Socket.IO:
            - "receive_chat_message": Streams chat responses to the client.
            - "error": Emits errors if input is invalid or processing fails.
            - "stream_stopped": Indicates the end of streaming or failure.

    Raises:
        Does not raise exceptions directly. All exceptions are caught, logged,
        and emitted as error messages to the client.
    """

    log.info("Received user input. Start processing.")

    try:
        global streaming
        streaming = True

        # get data
        prompt = data.get("input")
        content_id = data.get("contentId")

        # get cached data
        cached_data = cache.get(content_id)
        if not cached_data:
            log.error(f"Validation Error: No cached data found for UUID: {content_id}.")
            socketio.emit(
                "error",
                {
                    "message": f"Could not load data for chat session '{content_id}'. It may have expired."
                },
            )
            streaming = False
            socketio.emit("stream_stopped")
            return

        pdf_name = cached_data.get("title") if cached_data else None
        chat_history = cached_data.get("chat_history", [])
        rag_doc_slider = str(data.get("ragDocSlider"))
        print("-" * 10, "CHAT HISTORY", "-" * 10)
        print(chat_history)

        output_size = str(data.get("output_size"))
        show_pages_checkbox = str(data.get("show_pages_checkbox"))
        choosen_model = str(
            data.get("choosen_model", "gemini-2.0-flash")
        )  # second arg = default model
        change_length_checkbox = str(data.get("change_length_checkbox"))
        enhancer_checkbox = str(data.get("prompt_enhancer"))
        slider_value = data.get("slider_value")

        if slider_value is not None:
            slider_value = float(slider_value)
        else:
            slider_value = 0.0

        if not prompt:
            log.error("No prompt provided by user")
            socketio.emit("error", {"message": "No input provided"})
            streaming = False
            socketio.emit("stream_stopped")
            return

        if not content_id:
            log.error(
                f"Content ID missing or cached data not found for ID: {content_id}"
            )
            socketio.emit("error", {"message": "No content ID for the chat provided"})
            streaming = False
            socketio.emit("stream_stopped")
            return

        if not pdf_name:
            log.error(f"PDF name not found in cache for content ID: {content_id}")
            socketio.emit("error", {"message": "No pdf name provided"})
            streaming = False
            socketio.emit("stream_stopped")
            return

        if not isinstance(chat_history, list):
            log.warning(
                f"Cached data for '{content_id}' contained 'chat_history' but it was not a list "
                f"(type: {type(chat_history)}). Initializing as empty list."
            )
            chat_history = []

        # debug logs for each document
        log.debug(f"Prompt: {prompt}")
        log.debug(f"Content id: {content_id}")
        log.debug(f"Pdf name (from cache): {pdf_name}")
        log.debug(
            f"Initial Chat History (loaded/initialized): {len(chat_history)} messages"
        )
        log.debug(f"Output size: {output_size}")
        log.debug(f"Show pages: {show_pages_checkbox}")
        log.debug(f"Change output size: {change_length_checkbox}")
        log.debug(f"Selected model: {choosen_model}")
        log.debug(f"RAG or document: {rag_doc_slider}")
        log.debug(f"Prompt enhancer: {enhancer_checkbox}")

        # model instance inside the function to allow multiple models
        genai.configure(api_key=GEMINI_API_KEY)
        model = genai.GenerativeModel(
            choosen_model,
            system_instruction=show_pages(SYSTEM_PROMPT, show_pages_checkbox),
        )  # other models that can be used: "gemini-2.0-flash-thinking-exp-01-21" "gemini-2.0-flash"

        try:
            if not streaming:
                socketio.emit("stream_stopped")
                log.info("Stream stopped before file processing.")
            pdf_name_to_show = pdf_name

            collection_name = cached_data.get("collection_name")
            if not collection_name:
                # fallback for old cache entries
                collection_name = generate_vector_db_document_name(
                    pdf_name, max_length=CHROMADB_MAX_FILENAME_LENGTH
                )

            # print("-" * 10, "COLLECTION NAME HANDLING CHAT MESSAGE", "-" * 10)
            # print(collection_name)

            accumulated_text = ""
            for result_chunk in process_chat_query_with_rag(
                prompt,
                chat_history,
                pdf_name_to_show,
                model,
                change_length_checkbox,
                enhancer_checkbox,
                output_size,
                slider_value,
                chroma_client,
                collection_name,
                rag_doc_slider,
            ):
                if not streaming:
                    log.info("Stopping chat processing due to streaming flag.")
                    break
                # Check the structure of the yielded chunk
                if "content" in result_chunk:
                    log.debug(f'Received response chunk: {result_chunk["content"]}')
                    accumulated_text += result_chunk["content"]
                    chunk_text = result_chunk["content"]
                    socketio.emit("receive_chat_message", {"message": chunk_text})

                elif "error" in result_chunk:
                    error_message = result_chunk["error"]
                    log.error(
                        f"Error chunk from process_query_with_rag: {error_message}"
                    )
                    # --- Emit an error message to the frontend ---
                    socketio.emit("receive_chat_message", {"error": error_message})
                    # If an error occurs in a chunk, stop processing the rest of the stream
                    break
                else:
                    socketio.emit("error", {"message": "unexpected error"})
                    return
            if streaming:
                chat_history.append({"role": "user", "parts": [prompt]})
                chat_history.append({"role": "model", "parts": [accumulated_text]})
                cached_data["chat_history"] = (
                    chat_history  # <-- Assign the updated list back into the dictionary
                )
                cache.set(content_id, cached_data, timeout=3600)
                log.info(f"Stored updated data for {content_id} in cache.")

            if not streaming:
                socketio.emit("stream_stopped")
                log.info("Stream stopped during request processing.")

        except Exception as e:
            log.error(f"An error occurred in the generate function: {e}")
            traceback.print_exc()
            socketio.emit(
                "error", {"message": f"An unexpected error occurred: {str(e)}"}
            )

        streaming = False
        socketio.emit("stream_stopped")

    except Exception as e:
        log.error(f"An error occurred in the generate function: {e}")
        traceback.print_exc()
        socketio.emit("error", {"message": f"An unexpected error occurred: {str(e)}"})
        streaming = False
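
Example (a hedged sketch of the event payload, using the python-socketio client; the server address and contentId are assumptions, and the contentId must match a chat session previously stored in the cache):

import socketio  # python-socketio client

sio = socketio.Client()
sio.connect("http://localhost:5000")  # assumed address of the Flask-SocketIO server

@sio.on("receive_chat_message")
def on_chunk(data):
    # Each chunk carries either a "message" fragment or an "error" string.
    print(data.get("message") or data.get("error"), end="")

sio.emit("send_chat_message", {
    "input": "What are the risks mentioned in the document?",
    "contentId": "content-pdf0_3",        # must match a cached chat session
    "output_size": "medium",
    "choosen_model": "gemini-2.0-flash",
    "slider_value": 0.5,
    "show_pages_checkbox": "True",
    "change_length_checkbox": "False",
    "prompt_enhancer": "False",
    "ragDocSlider": "True",
})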

handle_clear_cache()

Clears all cached chat instances (UUIDs) created by the current client's session. Triggered by a button press on the main page.

Source code in src/main.py (lines 145-167)
@socketio.on("clear_cache")
def handle_clear_cache() -> None:
    """
    Clears all cached chat instances (UUIDs) created by the current client's session.
    Triggered by a button press on the main page.
    """
    global output_index
    sid = request.sid  # type: ignore[attr-defined]
    session_map_key = f"session_map_{sid}"
    session_content_ids = cache.get(session_map_key)

    if session_content_ids:
        log.info(f"Clear event for sid: {sid}. Clearing session's cached entries.")
        for container_id in session_content_ids:
            if cache.delete(container_id):
                log.info(f"Deleted cache for key: {container_id}")
        cache.delete(session_map_key)
        log.info(f"Deleted session map for sid: {sid}")
    else:
        log.info(f"Clear event for sid: {sid}. No session map found to clear.")

    output_index = -1
    log.info(f"Output index reset for sid: {sid}")

handle_disconnect()

Handles cache cleanup for all UUIDs created by a client's session.

Source code in src/main.py (lines 676-698)
@socketio.on("disconnect")
def handle_disconnect() -> None:
    """Handles cache cleanup for all UUIDs created by a client's session."""
    sid = request.sid  # type: ignore[attr-defined]
    session_map_key = f"session_map_{sid}"
    session_content_ids = cache.get(session_map_key)

    if session_content_ids:
        log.info(f"Disconnect event for sid: {sid}. Cleaning up cached entries.")
        for container_id in session_content_ids:
            if cache.delete(container_id):
                log.info(f"Deleted cache for key: {container_id}")
            else:
                log.warning(
                    f"Attempted to delete non-existent cache key: {container_id}"
                )

        cache.delete(session_map_key)
        log.info(f"Deleted session map for sid: {sid}")
    else:
        log.info(
            f"Disconnect event for sid: {sid}. No session map found, no cleanup needed."
        )

handle_reset_chat_history(data)

Finds a specific chat session by its UUID and resets its history, keeping only the first two messages (initial prompt and response).

Source code in src/main.py (lines 170-191)
@socketio.on("reset_chat_history")
def handle_reset_chat_history(data: dict) -> None:
    """
    Finds a specific chat session by its UUID and resets its history,
    keeping only the first two messages (initial prompt and response).
    """
    content_id = data.get("contentId")
    if not content_id:
        log.warning("Received reset_chat_history event without a contentId.")
        return

    log.info(f"Resetting chat history for UUID: {content_id}")

    cached_data = cache.get(content_id)

    if cached_data and "chat_history" in cached_data:
        cached_data["chat_history"] = cached_data["chat_history"][:2]
        cache.set(content_id, cached_data, timeout=3600)
        log.info(f"Successfully reset history for UUID: {content_id}")
        socketio.emit("history_reset_success", {"contentId": content_id})
    else:
        log.warning(f"Could not find data to reset for UUID: {content_id}")

handle_stop()

Stops the current processing stream when triggered by the client.

This function sets the global streaming flag to False, effectively stopping any ongoing data generation or response processing.

Examples:

None

Returns:

Type: None

None

Source code in src/main.py (lines 654-673)
@socketio.on("stop_processing")
def handle_stop() -> None:
    """Stops the current processing stream when triggered by the client.

    This function sets the global streaming flag to False, effectively stopping
    any ongoing data generation or response processing.

    Examples:
        None

    Returns:
        None

    Raises:
        None
    """

    global streaming
    streaming = False
    log.info("Processing Stopped by User")

index()

Serves the main page of the application with a list of available PDF files.

This route handler is mapped to the root URL ("/"). When accessed, it logs that the application is running, scans the designated directory for PDF files, and renders the homepage template with those files listed.

This enables users to see which documents are available for analysis.

Examples:

None

Returns:

Type: str

A rendered HTML page (index.html) with the following context:
  • pdf_files: A list of available PDF filenames in the scraped directory.
  • pdf_titles: A mapping of each PDF filename to its display title.
Source code in src/main.py (lines 116-142)
@app.route("/")
def index() -> str:
    """Serves the main page of the application with a list of available PDF files.

    This route handler is mapped to the root URL ("/"). When accessed, it logs
    that the application is running, scans the designated directory for PDF files,
    and renders the homepage template with those files listed.

    This enables users to see which documents are available for analysis.

    Examples:
        None

    Returns:
        A rendered HTML page (index.html) with the following context:
        - pdf_files: A list of available PDF filenames in the scraped directory.
        - pdf_titles: A mapping of each PDF filename to its display title.

    Raises:
        None directly
    """

    log.info("App is up")
    pdf_dir = Path(SCRAPED_FILES_DIR)
    pdf_files = [pdf.name for pdf in pdf_dir.glob("*.pdf")] if pdf_dir.exists() else []
    pdf_files = sorted(pdf_files, key=lambda x: extract_title_from_filename(x).lower())
    pdf_titles = {pdf: extract_title_from_filename(pdf) for pdf in pdf_files}
    return render_template("index.html", pdf_files=pdf_files, pdf_titles=pdf_titles)

process_text(data)

Handles the initial processing of user input and selected PDFs using a generative model.

This function is triggered via a Socket.IO event when a user initiates processing. It validates the input prompt and selected PDF files, sets up the selected Gemini model, and processes each PDF using retrieval-augmented generation (RAG). The response is streamed back to the client in real time, rendered in markdown, and cached for future access.

Each processed PDF results in the creation of a content container, which is dynamically sent to the frontend. If errors occur during processing, they are logged and sent to the client as error events.

Examples:

Triggered internally by Socket.IO when the user starts processing:

>>> process_text({
        "input": "Summarize the risks mentioned",
        "pdfFiles": ["KNF_2022_01.pdf"],
        "output_size": "short",
        "show_pages_checkbox": True,
        "choosen_model": "gemini-2.0-flash",
        ...
    })

Parameters:

data (dict, required):
    A dictionary containing user input and options. Expected keys include:
      - "input": User’s prompt (str).
      - "pdfFiles": List of PDF filenames to process (List[str]).
      - "output_size": Approximate length of the response (str).
      - "show_pages_checkbox": Whether to include page numbers (bool or str).
      - "choosen_model": Selected Gemini model (str).
      - "change_length_checkbox": Whether output length can vary (bool or str).
      - "slider_value": Float controlling verbosity or detail (str or float).
      - "ragDocSlider": Toggle between RAG and document mode (str).
      - Other UI flags or settings.

Returns:

Type: None

None. Results are streamed to the client via Socket.IO events:
  - "new_container": Sends a new HTML container for each PDF.
  - "update_content": Streams chunks of the model's response.
  - "processing_complete_for_container": Signals PDF completion.
  - "error": Sends error messages if validation or processing fails.
  - "stream_stopped": Indicates the end of the streaming session.

Source code in src/main.py (lines 194-432)
@socketio.on("start_processing")
def process_text(data: dict) -> None:  # noqa: C901
    """Handles the initial processing of user input and selected PDFs using a generative model.

    This function is triggered via a Socket.IO event when a user initiates processing.
    It validates the input prompt and selected PDF files, sets up the selected
    Gemini model, and processes each PDF using retrieval-augmented generation (RAG).
    The response is streamed back to the client in real time, rendered in markdown,
    and cached for future access.

    Each processed PDF results in the creation of a content container, which is
    dynamically sent to the frontend. If errors occur during processing, they are
    logged and sent to the client as error events.

    Examples:
        # Triggered internally by Socket.IO when the user starts processing:
        >>> process_text({
                "input": "Summarize the risks mentioned",
                "pdfFiles": ["KNF_2022_01.pdf"],
                "output_size": "short",
                "show_pages_checkbox": True,
                "choosen_model": "gemini-2.0-flash",
                ...
            })

    Args:
        data: A dictionary containing user input and options. Expected keys include:
            - "input": User’s prompt (str).
            - "pdfFiles": List of PDF filenames to process (List[str]).
            - "output_size": Approximate length of the response (str).
            - "show_pages_checkbox": Whether to include page numbers (bool or str).
            - "choosen_model": Selected Gemini model (str).
            - "change_length_checkbox": Whether output length can vary (bool or str).
            - "slider_value": Float controlling verbosity or detail (str or float).
            - "ragDocSlider": Toggle between RAG and document mode (str).
            - Other UI flags or settings.

    Returns:
        None. Results are streamed to the client via Socket.IO events:
            - "new_container": Sends a new HTML container for each PDF.
            - "update_content": Streams chunks of the model's response.
            - "processing_complete_for_container": Signals PDF completion.
            - "error": Sends error messages if validation or processing fails.
            - "stream_stopped": Indicates the end of the streaming session.

    Raises:
        Emits error events instead of raising exceptions directly.
        Internal exceptions are caught, logged, and passed to the client as messages.
    """

    log.info("Started input processing")

    try:
        global output_index
        output_index += 1
        global streaming
        streaming = True
        sid = request.sid  # type: ignore[attr-defined]

        log.info(f"SID start_processing main page: {request.sid}")  # type: ignore[attr-defined]
        # get data
        prompt = data.get("input")
        selected_files = data.get("pdfFiles")
        output_size = data.get("output_size")
        show_pages_checkbox = str(data.get("show_pages_checkbox"))
        choosen_model = str(
            data.get("choosen_model", "gemini-2.0-flash")
        )  # second arg = default model
        change_length_checkbox = str(data.get("change_length_checkbox"))
        enhancer_checkbox = str(data.get("prompt_enhancer"))
        slider_value = data.get("slider_value")
        rag_doc_slider = str(data.get("ragDocSlider"))

        if slider_value is not None:
            slider_value = float(slider_value)
        else:
            slider_value = 0.0

        if not prompt:
            log.error("No prompt provided")
            socketio.emit("error", {"message": "No input provided"})
            streaming = False
            socketio.emit("stream_stopped")
            return

        if not selected_files or selected_files == []:
            log.error("no selected files")
            socketio.emit("error", {"message": "No files selected"})
            streaming = False
            socketio.emit("stream_stopped")
            return

        output_size = str(output_size)

        # debug logs for each document
        log.debug(f"prompt: {prompt}")
        log.debug(f"selected files: {selected_files}")
        log.debug(f"output size: {output_size}")
        log.debug(f"Show pages: {show_pages_checkbox}")
        log.debug(f"Change output size: {change_length_checkbox}")
        log.debug(f"selected_model: {choosen_model}")
        log.debug(f"Prompt enhancer: {enhancer_checkbox}")
        log.debug(f"RAG or document: {rag_doc_slider}")

        # files
        pdf_dir = Path(SCRAPED_FILES_DIR)

        pdfs_to_scan = [pdf_dir / file_name for file_name in selected_files]

        for pdf in pdfs_to_scan:
            log.info(pdf)

        # model instance inside the function to allow multiple models
        genai.configure(api_key=GEMINI_API_KEY)
        model = genai.GenerativeModel(
            choosen_model,
            system_instruction=show_pages(SYSTEM_PROMPT, show_pages_checkbox),
        )  # other models that can be used: "gemini-2.0-flash-thinking-exp-01-21" "gemini-2.0-flash"

        try:
            for index, pdf in enumerate(pdfs_to_scan):
                if not streaming:
                    break
                # document title extraction
                pdf_parts = pdf.stem.split("_", 2)
                if len(pdf_parts) == 3:
                    doc_id, timestamp, title = pdf_parts
                    pdf_name_to_show = title.lstrip("_").rstrip("_")
                else:
                    pdf_name_to_show = pdf.stem.lstrip("_").rstrip("_")  # fallback

                container_id = str(uuid.uuid4())
                log.info(f"Generated unique container ID (UUID): {container_id}")

                container_html = render_template(
                    "output.html",
                    container_title=pdf_name_to_show,
                    container_id=container_id,
                )

                log.info(f"New container created for: {pdf_name_to_show}")
                socketio.emit("new_container", {"html": container_html})

                collection_name = generate_vector_db_document_name(
                    pdf.stem, max_length=CHROMADB_MAX_FILENAME_LENGTH
                )
                print("-" * 10, "COLLECTION NAME", "-" * 10)
                print(collection_name)

                accumulated_text = ""

                for result_chunk in process_query_with_rag(
                    prompt,
                    pdf_name_to_show,
                    model,
                    change_length_checkbox,
                    enhancer_checkbox,
                    output_size,
                    slider_value,
                    chroma_client,
                    collection_name,
                    rag_doc_slider,
                ):
                    if not streaming:
                        break
                    if "error" in result_chunk:
                        log.error("Error received in chunk")
                        error_message = {"message": "error in chunk response"}
                        socketio.emit("error", error_message)
                        return
                    elif "content" in result_chunk:
                        log.debug(f'Received response chunk: {result_chunk["content"]}')
                        accumulated_text += result_chunk["content"]
                        markdown_content = markdown.markdown(accumulated_text)
                        final_markdown_content = (
                            markdown_content  # Keep track of the latest full content
                        )
                        socketio.emit(
                            "update_content",
                            {
                                "container_id": container_id,
                                "html": markdown_content,
                            },
                        )
                    else:
                        socketio.emit("error", {"message": "unexpected error"})
                        return
                if streaming:
                    chat_history = [
                        {"role": "user", "parts": [prompt]},
                        {"role": "model", "parts": [accumulated_text]},
                    ]
                    data_to_cache = {
                        "title": pdf_name_to_show,
                        "content": final_markdown_content,
                        "chat_history": chat_history,
                        "collection_name": collection_name,
                    }
                    # Set a timeout (e.g., 1 hour = 3600 seconds)
                    cache.set(container_id, data_to_cache, timeout=3600)
                    log.info(f"Stored content for unique key {container_id} in cache.")
                    log.info(
                        f"Initial processing complete for container ID: {container_id}."
                        "Emitting completion signal."
                    )
                    session_map_key = f"session_map_{sid}"
                    session_content_ids = cache.get(session_map_key) or []
                    if container_id not in session_content_ids:
                        session_content_ids.append(container_id)
                        cache.set(session_map_key, session_content_ids, timeout=3600)
                        log.info(f"Added {container_id} to session map for sid: {sid}")
                    # Emit a custom event indicating completion for THIS container
                    socketio.emit(
                        "processing_complete_for_container",
                        {"container_id": container_id},
                    )
                else:
                    log.info(
                        f"Processing stopped for {pdf_name_to_show} ({container_id}). Not emitting completion signal."
                    )

            if not streaming:
                socketio.emit("stream_stopped")
                log.info("Stream stopped during file processing.")
        except Exception as e:
            log.error(f"An error occurred in the generate function: {e}")
            traceback.print_exc()
            socketio.emit(
                "error", {"message": f"An unexpected error occurred: {str(e)}"}
            )

        streaming = False
        socketio.emit("stream_stopped")

    except Exception as e:
        log.error(f"An error occurred in the generate function: {e}")
        traceback.print_exc()
        socketio.emit("error", {"message": f"An unexpected error occurred: {str(e)}"})
        streaming = False
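
Example (a hedged sketch using Flask-SocketIO's test client; the import path, filename, and option values are assumptions, and a real run requires a valid GEMINI_API_KEY and an indexed document):

from main import app, socketio  # import path depends on how the project is installed

client = socketio.test_client(app)
client.emit("start_processing", {
    "input": "Summarize the risks mentioned",
    "pdfFiles": ["1_20220101_Rekomendacja_A.pdf"],  # hypothetical scraped file
    "output_size": "short",
    "show_pages_checkbox": "True",
    "choosen_model": "gemini-2.0-flash",
    "change_length_checkbox": "False",
    "prompt_enhancer": "False",
    "slider_value": 0.5,
    "ragDocSlider": "True",
})
for event in client.get_received():
    print(event["name"])  # new_container, update_content, ..., stream_stopped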

extract_text_from_pdf(pdf_path)

Extracts text from a PDF file.

This function reads a PDF file, extracts text from each page, and collects the extracted text into a list of per-page strings. If an error occurs during extraction, it logs the error and returns an empty string.

Examples:

>>> extract_text_from_pdf(Path("document.pdf"))
['This is the extracted text from the PDF document.']

Parameters:

pdf_path (Path, required):
    A Path object representing the PDF file to extract text from.

Returns:

A list of strings, one per page, containing the extracted text from the PDF.
If an error occurs, an empty string is returned.

Raises:

Exception: Any exceptions encountered while reading or extracting text
from the PDF are logged and handled.
Source code in src/backend/extract_text.py (lines 11-46)
def extract_text_from_pdf(pdf_path: Path):  # type: ignore
    """Extracts text from a PDF file.

    This function reads a PDF file, extracts text from each page, and collects
    the extracted text into a list of per-page strings. If an error occurs during
    extraction, it logs the error and returns an empty string.

    Examples:
        >>> extract_text_from_pdf(Path("document.pdf"))
        ['This is the extracted text from the PDF document.']

    Args:
        pdf_path: A Path object representing the PDF file to extract text from.

    Returns:
        A list of strings, one per page, containing the extracted text from the PDF.
        If an error occurs, an empty string is returned.

    Raises:
        Exception: Any exceptions encountered while reading or extracting text
        from the PDF are logged and handled.
    """

    try:
        pages_list = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:  # some pages may return None
                    pages_list.append(page_text)
        return pages_list

    except Exception as e:
        log.error(f"Error processing {pdf_path}: {str(e)}")
        traceback.print_exc()
        return ""

scrape_knf(scraped_dir, num_retries, user_agent_list)

Scrapes pdf files from KNF url.

This function scrapes PDF files from a KNF URL. For a given number of retries it masquerades as a user agent from the provided agent list and downloads each file into a directory, adding a document ID to the file name. If an error occurs during scraping, it logs the error message.

Examples:

>>> scrape_knf(Path("data/scraped"), 10, ["Mozilla/5.0", "Mozilla/4.0"])
None

Parameters:

scraped_dir (Path, required):
    The directory where downloaded PDF files are saved.
num_retries (int, required):
    An int describing the number of retries the program will attempt when scraping a file.
user_agent_list (list, required):
    A list of strings with user agents for masking.

Returns:

Type: None

None

Raises:

Exception: If an error occurs while downloading or parsing a file, it is logged and the file is skipped.
Source code in src/backend/knf_scraping.py (lines 43-131)
def scrape_knf(scraped_dir: Path, num_retries: int, user_agent_list: list) -> None:
    """Scrapes pdf files from KNF url.

    This function scrapes PDF files from a KNF URL.
    For a given number of retries it masquerades as a user agent from the provided
    agent list and downloads each file into a directory, adding a document ID to
    the file name. If an error occurs during scraping, it logs the error message.

    Examples:
        >>> scrape_knf(Path("data/scraped"), 10, ["Mozilla/5.0", "Mozilla/4.0"])
        None

    Args:
        scraped_dir: The directory where downloaded PDF files are saved.
        num_retries: An int describing the number of retries the program will
            attempt when scraping a file.
        user_agent_list: A list of strings with user agents for masking.

    Returns:
        None

    Raises:
        Exception: If an error occurs while downloading or parsing a file, it is
            logged and the file is skipped.
    """

    knf_base_url = "https://www.knf.gov.pl"
    knf_recommendations_url = (
        f"{knf_base_url}/dla_rynku/regulacje_i_praktyka/rekomendacje_i"
        + "_wytyczne/rekomendacje_dla_bankow?articleId=8522&p_id=18"
    )

    scraped_dir.mkdir(parents=True, exist_ok=True)

    response = None
    for _ in range(num_retries):
        try:
            headers = {
                "User-Agent": user_agent_list[
                    random.randint(0, len(user_agent_list) - 1)
                ]
            }
            response = requests.get(knf_recommendations_url, headers=headers)
            if response.status_code in [200, 404]:
                break  # escape loop if response was successful
        except requests.exceptions.ConnectionError:
            log.error("Connection failed, retrying...")

    if response and response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")

        time_html_tag = soup.time
        datetime_atr = time_html_tag["datetime"]  # type: ignore[index]

        pdf_titles_links = {}
        for link in soup.find_all(
            "a", title=lambda x: x and "rekomendacja" in x.lower()
        ):
            try:
                href = link.get("href")
                title = link.get_text(strip=True)
                if href and href.endswith(".pdf"):
                    # not all .pdf are listed on knf.gov.pl
                    if "https" not in href:
                        full_url = knf_base_url + href
                        pdf_titles_links[title] = full_url
                    else:
                        pdf_titles_links[title] = href
            except Exception as e:
                log.error(f"Problem with link for {link} \n Error messange: {e}\n")

        for title, url in pdf_titles_links.items():
            if len(title) > 1:  # temporary: scraping needs deeper rework;
                try:
                    pdf_response = requests.get(url, headers=headers)
                    safe_title = windows_safe_filename(title) if title else "unknown"
                    entry = get_or_assign_id(safe_title)
                    # adding datetime from KNF site to file name
                    assert isinstance(datetime_atr, str)
                    date_str = datetime_atr[:10].replace("-", "")
                    unique_filename = f"{entry['id']}_{date_str}_{safe_title}.pdf"
                    pdf_path = scraped_dir / unique_filename
                    with open(pdf_path, "wb") as f:
                        f.write(pdf_response.content)
                        log.debug(f"Downloaded: {pdf_path}")
                except Exception as e:
                    log.error(f"PDF not downloaded: {url} \n Error messange: {e}\n")
                    traceback.print_exc()
    else:
        log.error("Failed to retrieve the main page content after retries.")

windows_safe_filename(filename)

Removes invalid characters from the file name.

This function removes any invalid character in Windows file names from the filename.
It also replaces end-of-line characters "\n" with spaces " ". Returns the new file name.

Examples:
    >>> windows_safe_filename('invalid:filename?.txt')
    'invalidfilename.txt'

Args:
    filename: A string containing the file name.

Returns:
    A string containing file name cleansed from any invalid characters.

Raises:
    None
Source code in src/backend/knf_scraping.py (lines 16-40)
def windows_safe_filename(filename: str) -> str:
    """Removes invalid characters from the file name.

    This function removes any invalid character in Windows file name from the filename.
    It also replaces end of line characters "\n" with spaces " ".
    Returns the new file name.

    Examples:
        >>> windows_safe_filename('invalid:filename?.txt')
        'invalidfilename.txt'

    Args:
        filename: A string containing the file name.

    Returns:
        A string containing file name cleansed from any invalid characters.

    Raises:
        None
    """
    filename = filename.replace("\n", " ")
    filename = re.sub(
        r'[<>:"/\\|?*]', "", filename
    )  # <>:"/\|?* are invalid characters in Windows file names
    return filename

process_chat_query_with_rag(prompt, chat_history, pdf_name, model, change_length_checkbox, enhancer_checkbox, output_size, temperature_slider_value, chroma_client, collection_name, rag_doc_slider)

Processes a chat query using RAG, incorporating conversation history.

Retrieves relevant context from a document collection based on the current user prompt. It then combines the (potentially enhanced) prompt with this context and the existing chat history, queries the generative model, and streams the response.

Parameters:

prompt (str, required):
    The user's current query/message in the conversation.
chat_history (str, required):
    A string representation of the conversation history.
    (Note: Assumes model.start_chat() accepts this string format).
pdf_name (str, required):
    The identifier/name of the document (for RAG context and logging).
model (GenerativeModel, required):
    The generative AI model instance (e.g., genai.GenerativeModel).
change_length_checkbox (str, required):
    String flag ("True"/"False") to modify response length.
enhancer_checkbox (str, required):
    String flag ("True"/"False") for prompt enhancement.
output_size (str, required):
    The desired output size (e.g., number of words).
temperature_slider_value (float, required):
    Temperature for model generation.
chroma_client (Client, required):
    The ChromaDB client instance.
collection_name (str, required):
    Name of the ChromaDB collection for this document.
rag_doc_slider (str, required):
    String flag ("True" to use all chunks from the document's collection, "False" for a default number).

Yields:

dict:
    A dictionary for each chunk of the response or for an error.
    For content: {"pdf_name": str, "content": str}.
    For error: {"error": str}.

Source code in src/backend/process_query.py (lines 343-434)
def process_chat_query_with_rag(
    prompt: str,
    chat_history: str,
    pdf_name: str,
    model: genai.GenerativeModel,
    change_length_checkbox: str,
    enhancer_checkbox: str,
    output_size: str,
    temperature_slider_value: float,
    chroma_client: ChromaClient,
    collection_name: str,
    rag_doc_slider: str,
) -> Any:
    """
    Processes a chat query using RAG, incorporating conversation history.

    Retrieves relevant context from a document collection based on the current
    user prompt. It then combines the (potentially enhanced) prompt with this
    context and the existing chat history, queries the generative model,
    and streams the response.

    Args:
        prompt: The user's current query/message in the conversation.
        chat_history: A string representation of the conversation history.
                      (Note: Assumes model.start_chat() accepts this string format).
        pdf_name: The identifier/name of the document (for RAG context and logging).
        model: The generative AI model instance (e.g., genai.GenerativeModel).
        change_length_checkbox: String flag ("True"/"False") to modify response length.
        enhancer_checkbox: String flag ("True"/"False") for prompt enhancement.
        output_size: The desired output size (e.g., number of words).
        temperature_slider_value: Temperature for model generation.
        chroma_client: The ChromaDB client instance.
        collection_name: Name of the ChromaDB collection for this document.
        rag_doc_slider: String flag ("True" to use all chunks from the document's
                        collection, "False" for a default number).

    Yields:
        dict: A dictionary for each chunk of the response or for an error.
              For content: `{"pdf_name": str, "content": str}`.
              For error: `{"error": str}`.
    """

    if not prompt:
        yield {"error": "No prompt provided"}
        return

    rag_context = _get_rag_context(
        prompt=prompt,
        pdf_name=pdf_name,
        chroma_client=chroma_client,
        collection_name=collection_name,
        rag_doc_slider=rag_doc_slider,
        embedding_function=get_gemini_ef(),
    )
    log.debug(f"Context for {pdf_name} (chat query):\n{rag_context}\n")

    final_llm_prompt = _build_final_llm_prompt(
        base_prompt=prompt,
        change_length_flag=change_length_checkbox,
        output_size=output_size,
        enhancer_flag=enhancer_checkbox,
        model=model,
        identifier=pdf_name,
        rag_context=rag_context,
        chat_history=chat_history,
    )

    try:
        log.info(
            f"Generating chat response for query on '{pdf_name}' with prompt: "
            f"'{final_llm_prompt[:200]}...'"
        )
        chat = model.start_chat(history=chat_history)  # type: ignore[arg-type]
        response = chat.send_message(
            [final_llm_prompt],
            stream=True,
            generation_config=genai.types.GenerationConfig(
                temperature=temperature_slider_value
            ),
        )

        for response_chunk in response:
            # replace -> sometimes a double space between words occurs; most likely reason: pdf formatting
            response_chunk_text = response_chunk.text.replace("  ", " ")
            yield {"pdf_name": pdf_name, "content": response_chunk_text}
            time.sleep(STREAM_RESPONSE_CHUNK_DELAY_SECONDS)
        log.debug(f"Response for: {pdf_name} was saved!\n")
        time.sleep(POST_PROCESS_DELAY_SECONDS)  # lower API request rate per sec
    except Exception as e:
        log.error(f"There is a problem with {pdf_name}. \n Error message: {e}\n")
        traceback.print_exc()
        yield {"error": f"An error occurred while processing {pdf_name}: {str(e)}"}

process_pdf(prompt, pdf, model, change_length_checkbox, enhancer_checkbox, output_size, temperature_slider_value)

Uploads a PDF, processes it with a generative model, and streams content.

This function takes a PDF file and a prompt, uploads the file, and then calls the generative model to process the content based on the (potentially enhanced) prompt. It streams the model's response, yielding cleaned text chunks or an error dictionary.

Parameters:

prompt (str, required):
    The base prompt for processing the document.
pdf (Path, required):
    A Path object representing the PDF file to be processed.
model (GenerativeModel, required):
    The generative AI model instance (e.g., genai.GenerativeModel).
change_length_checkbox (str, required):
    String flag ("True"/"False") to indicate if output size instruction should be added.
enhancer_checkbox (str, required):
    String flag ("True"/"False") to indicate if the prompt should be enhanced.
output_size (str, required):
    The desired output size (e.g., number of words).
temperature_slider_value (float, required):
    The temperature setting for model generation.

Yields:

dict:
    A dictionary for each chunk of the response or for an error.
    For content: {"pdf_name": str, "content": str}.
    For error: {"error": str}.

Source code in src/backend/process_query.py (lines 191-258)
def process_pdf(
    prompt: str,
    pdf: Path,
    model: genai.GenerativeModel,
    change_length_checkbox: str,
    enhancer_checkbox: str,
    output_size: str,
    temperature_slider_value: float,
) -> Any:
    """
    Uploads a PDF, processes it with a generative model, and streams content.

    This function takes a PDF file and a prompt, uploads the file,
    and then calls the generative model to process the content based on
    the (potentially enhanced) prompt. It streams the model's response,
    yielding cleaned text chunks or an error dictionary.

    Args:
        prompt: The base prompt for processing the document.
        pdf: A Path object representing the PDF file to be processed.
        model: The generative AI model instance (e.g., genai.GenerativeModel).
        change_length_checkbox: String flag ("True"/"False") to indicate if
                                output size instruction should be added.
        enhancer_checkbox: String flag ("True"/"False") to indicate if the
                           prompt should be enhanced.
        output_size: The desired output size (e.g., number of words).
        temperature_slider_value: The temperature setting for model generation.

    Yields:
        dict: A dictionary for each chunk of the response or for an error.
              For content: `{"pdf_name": str, "content": str}`
              For error: `{"error": str}`.
    """

    if not prompt:
        yield {"error": "No prompt provided"}
        return

    try:
        log.info(f"Document: {pdf.stem} is beeing analyzed.")
        file_to_send = genai.upload_file(pdf)
        log.debug(f"PDF uploaded successfully. File metadata: {file_to_send}\n")

        final_llm_prompt_for_model = _build_final_llm_prompt(
            base_prompt=prompt,
            change_length_flag=change_length_checkbox,
            output_size=output_size,
            enhancer_flag=enhancer_checkbox,
            model=model,
            identifier=pdf.stem,
        )
        response = model.generate_content(
            [final_llm_prompt_for_model, file_to_send],
            stream=True,
            generation_config={"temperature": temperature_slider_value},
        )

        for response_chunk in response:
            # replace -> sometimes a double space between words occurs; most likely reason: pdf formatting
            response_chunk_text = response_chunk.text.replace("  ", " ")
            yield {"pdf_name": pdf, "content": response_chunk_text}
            time.sleep(STREAM_RESPONSE_CHUNK_DELAY_SECONDS)
        log.debug(f"Response for: {pdf} was saved!\n")
        time.sleep(POST_PROCESS_DELAY_SECONDS)  # lower API request rate per sec
    except Exception as e:
        log.error(f"There is a problem with {pdf.stem}. \n Error message: {e}\n")
        traceback.print_exc()
        yield {"error": f"An error occurred while processing {pdf.stem}: {str(e)}"}

process_query_with_rag(prompt, pdf_name, model, change_length_checkbox, enhancer_checkbox, output_size, temperature_slider_value, chroma_client, collection_name, rag_doc_slider)

Processes a query using RAG, combining it with context from a document.

This function retrieves relevant context from a specified document collection (via ChromaDB) based on the user's prompt. It then combines the (potentially enhanced) prompt with this context and queries the generative model, streaming the response.

Parameters:

prompt (str, required):
    The user's base query/prompt.
pdf_name (str, required):
    The identifier/name of the document (for RAG context and logging).
model (GenerativeModel, required):
    The generative AI model instance (e.g., genai.GenerativeModel).
change_length_checkbox (str, required):
    String flag ("True"/"False") to modify response length.
enhancer_checkbox (str, required):
    String flag ("True"/"False") for prompt enhancement.
output_size (str, required):
    The desired output size (e.g., number of words).
temperature_slider_value (float, required):
    Temperature for model generation.
chroma_client (Client, required):
    The ChromaDB client instance.
collection_name (str, required):
    Name of the ChromaDB collection for this document.
rag_doc_slider (str, required):
    String flag ("True" to use all chunks from the document's collection, "False" for a default number).

Yields:

dict:
    A dictionary for each chunk of the response or for an error.
    For content: {"pdf_name": str, "content": str}.
    For error: {"error": str}.

Source code in src/backend/process_query.py (lines 261-340)
def process_query_with_rag(
    prompt: str,
    pdf_name: str,
    model: genai.GenerativeModel,
    change_length_checkbox: str,
    enhancer_checkbox: str,
    output_size: str,
    temperature_slider_value: float,
    chroma_client: ChromaClient,
    collection_name: str,
    rag_doc_slider: str,
) -> Any:
    """
    Processes a query using RAG, combining it with context from a document.

    This function retrieves relevant context from a specified document collection
    (via ChromaDB) based on the user's prompt. It then combines the
    (potentially enhanced) prompt with this context and queries the generative
    model, streaming the response.

    Args:
        prompt: The user's base query/prompt.
        pdf_name: The identifier/name of the document (for RAG context and logging).
        model: The generative AI model instance (e.g., genai.GenerativeModel).
        change_length_checkbox: String flag ("True"/"False") to modify response length.
        enhancer_checkbox: String flag ("True"/"False") for prompt enhancement.
        output_size: The desired output size (e.g., number of words).
        temperature_slider_value: Temperature for model generation.
        chroma_client: The ChromaDB client instance.
        collection_name: Name of the ChromaDB collection for this document.
        rag_doc_slider: String flag ("True" to use all chunks from the document's
                        collection, "False" for a default number).

    Yields:
        dict: A dictionary for each chunk of the response or for an error.
              For content: `{"pdf_name": str, "content": str}`.
              For error: `{"error": str}`.
    """

    if not prompt:
        yield {"error": "No prompt provided"}
        return

    rag_context = _get_rag_context(
        prompt=prompt,
        pdf_name=pdf_name,
        chroma_client=chroma_client,
        collection_name=collection_name,
        rag_doc_slider=rag_doc_slider,
        embedding_function=get_gemini_ef(),
    )
    log.debug(f"Context for {pdf_name}:\n{rag_context}\n")

    final_llm_prompt = _build_final_llm_prompt(
        base_prompt=prompt,
        change_length_flag=change_length_checkbox,
        output_size=output_size,
        enhancer_flag=enhancer_checkbox,
        model=model,
        identifier=pdf_name,
        rag_context=rag_context,
    )

    try:
        response = model.generate_content(
            [final_llm_prompt],
            stream=True,
            generation_config={"temperature": temperature_slider_value},
        )
        for response_chunk in response:
            # replace -> sometimes a double space between words occurs; most likely reason: pdf formatting
            response_chunk_text = response_chunk.text.replace("  ", " ")
            yield {"pdf_name": pdf_name, "content": response_chunk_text}
            time.sleep(STREAM_RESPONSE_CHUNK_DELAY_SECONDS)
        log.debug(f"Response for: {pdf_name} was saved!\n")
        time.sleep(POST_PROCESS_DELAY_SECONDS)  # lower API request rate per sec
    except Exception as e:
        log.error(f"There is a problem with {pdf_name}. \n Error message: {e}\n")
        traceback.print_exc()
        yield {"error": f"An error occurred while processing {pdf_name}: {str(e)}"}

show_pages(system_prompt, show_pages_checkbox)

Appends an additional string to the system prompt to instruct the model to include page numbers in its outputs.

Source code in src/backend/show_pages.py (lines 13-22)
def show_pages(system_prompt: str, show_pages_checkbox: str) -> str:
    """
    Appends an additional string to the system prompt to instruct the model to include page numbers in its outputs.
    """
    if show_pages_checkbox == "True":  # request can be used only inside the function
        log.debug(system_prompt + OPTIONAL_PAGE_NUMBER_SP)
        return system_prompt + OPTIONAL_PAGE_NUMBER_SP
    else:
        log.debug(system_prompt)
        return system_prompt
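
Example (a tiny sketch; the base prompt is a placeholder and the import path is assumed):

from backend.show_pages import show_pages  # assumed import path

base_prompt = "You are a helpful assistant for KNF documents."
with_pages = show_pages(base_prompt, "True")      # appends the page-number instruction
without_pages = show_pages(base_prompt, "False")  # returns the prompt unchanged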