lmassaron · February 26, 2025 18:07
diff --git a/personal-assistant-for-knowledge-management-based-on-gemini-on-vertex-ai.ipynb b/personal-assistant-for-knowledge-management-based-on-gemini-on-vertex-ai.ipynb
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "authorship_tag": "ABX9TyOssKpkE1nrfhh4e2f6Cpg+",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {
        "821b1207e4d245178d8e1802304b50f3": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HBoxModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_341565875e1e42f8be25ab256e8a3c6e",
              "IPY_MODEL_b7b93e025f5b48d6a9a739bc1072066f",
              "IPY_MODEL_43ee144b658b4b69aaf6b65dadac7834"
            ],
            "layout": "IPY_MODEL_a3c64db6218e4c0aaf7164d8f054ca29"
          }
        },
        "341565875e1e42f8be25ab256e8a3c6e": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_a4f747901ad24063811591dd7fb5ad39",
            "placeholder": "",
            "style": "IPY_MODEL_4974d0ade9e3446fbf77a2e2a0ba1598",
            "value": "100%"
          }
        },
        "b7b93e025f5b48d6a9a739bc1072066f": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "FloatProgressModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_bb23fa422e774336ab853a4bdafea691",
            "max": 8,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_fc50fbcb6d474acb9cd690c30c029269",
            "value": 8
          }
        },
        "43ee144b658b4b69aaf6b65dadac7834": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "HTMLModel",
          "model_module_version": "1.5.0",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_60d0fd9ef4734f8dbd61100515f9fe10",
            "placeholder": "",
            "style": "IPY_MODEL_56c8d3f56ac2460ab36c8f99de82c80d",
            "value": " 8/8 [00:30&lt;00:00,  4.85s/it]"
          }
        },
        "a3c64db6218e4c0aaf7164d8f054ca29": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "a4f747901ad24063811591dd7fb5ad39": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "4974d0ade9e3446fbf77a2e2a0ba1598": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "bb23fa422e774336ab853a4bdafea691": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "fc50fbcb6d474acb9cd690c30c029269": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "ProgressStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "60d0fd9ef4734f8dbd61100515f9fe10": {
          "model_module": "@jupyter-widgets/base",
          "model_name": "LayoutModel",
          "model_module_version": "1.2.0",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "56c8d3f56ac2460ab36c8f99de82c80d": {
          "model_module": "@jupyter-widgets/controls",
          "model_name": "DescriptionStyleModel",
          "model_module_version": "1.5.0",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        }
      }
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/lmassaron/09cc9973735562abd6acf8afc2b7ca33/personal-assistant-for-knowledge-management-based-on-gemini-on-vertex-ai.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Personal Assistant for knowledge management based on Gemini 2.0 and Vertex AI\n",
        "\n",
        "---\n"
      ],
      "metadata": {
        "id": "LaAm3I-PpDce"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "The code %pip install --upgrade --quiet google-genai is a command that installs or upgrades the Google Generative AI (Google Genai) Python package in a Jupyter notebook or Google Colab environment.\n",
        "\n",
        "The Google Genai package allows you to:\n",
        "\n",
        "* Access Google's generative AI models programmatically\n",
        "* Generate content using text prompts\n",
        "* Work with multimodal inputs (combining text, images, etc.)\n",
        "* Fine-tune and customize model behaviors for specific applications\n",
        "\n",
        "This command is typically one of the first steps when setting up a notebook to work with Google's AI models, as it ensures you have the required library installed with the latest features and bug fixes."
      ],
      "metadata": {
        "id": "X8sYzNG9d1Ti"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "%pip install --upgrade --quiet google-genai"
      ],
      "metadata": {
        "id": "MWfVsNr-pLs4"
      },
      "execution_count": 1,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "The code has to check if it is running in Google Colab and, if so, it performs Google authentication and retrieves a user-defined Google Cloud project. In particular, it gets the GOOGLE_CLOUD_PROJECT from the Google Colab secrets (I previosuly set up this values).\n",
        "\n",
        "GOOGLE_CLOUD_PROJECT is an environment variable in Google Cloud that stores the project ID of the active Google Cloud project. It is used to associate API calls and resources with a specific project.\n",
        "\n",
        "This allows Google Colab notebooks to interact with Google Cloud services, in particular Vertex AI, under the correct project. You can get your Project ID after setting up your project as a Google Cloud project and enabling it to use the Vertex AI API.\n",
        "\n",
        "You therefore need before proceeding:\n",
        "\n",
        "1. Create or Select a Google Cloud Project - and for doing so you need to go to your Google Cloud Console Select an existing project or create a new one\n",
        "2. Enable the Vertex AI API (you can use this link: https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com)\n",
        "\n",
        "If you need support in these passages you can use the guide presented at https://cloud.google.com/vertex-ai/docs/start/cloud-environment which takes you, step by step, through all the necessary passage."
      ],
      "metadata": {
        "id": "zp-o-OpyoOyb"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import sys\n",
        "\n",
        "if \"google.colab\" in sys.modules:\n",
        "    from google.colab import auth\n",
        "    from google.colab import userdata\n",
        "\n",
        "    auth.authenticate_user()\n",
        "    GOOGLE_CLOUD_PROJECT = userdata.get('GOOGLE_CLOUD_PROJECT')"
      ],
      "metadata": {
        "id": "HaDWGXb3pQpk"
      },
      "execution_count": 2,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "The next passage will set some furthermore key configuration settings for using Google Cloud AI services. GOOGLE_CLOUD_LOCATION is set to 'us-central1', specifying the preferred region where the cloud resources will operate. You can decide for any region you prefer or commonly use. You can find a full list here: https://cloud.google.com/vertex-ai/docs/general/locations\n",
        "\n",
        "The gemini_model variable assigns \"gemini-2.0-flash-001\", indicating the specific version of Google's Gemini AI model to be used for tasks like text generation. A list of available models is present here: https://ai.google.dev/gemini-api/docs/models/gemini.\n",
        "\n",
        "Gemini 2.0 Flash is an advanced multimodal AI model optimized for speed and next-generation features across a variety of tasks. It accepts audio, images, video, and text as input and generates text (with image and audio generation coming soon). The model supports structured outputs, function calling, code execution, and search, making it versatile for AI-driven applications.\n",
        "\n",
        "With a large input token limit (1,048,576) and output token limit (8,192), it can handle complex and lengthy interactions. Features like caching, image generation, and audio generation are planned for future updates. While tuning is not supported, the model enables native tool use and will soon support a Multimodal Live API.\n",
        "\n",
        "The latest stable version, gemini-2.0-flash-001, has been updated in February 2025 and has a knowledge cutoff of August 2024. You can find further information about the model here: https://cloud.google.com/vertex-ai/generative-ai/docs/gemini-v2\n",
        "\n",
        "Lastly, embeddings is set to \"text-embedding-004\", which refers to a text embedding model for transforming text into numerical vectors, useful for NLP tasks like similarity search or retrieval-augmented generation (RAG)."
      ],
      "metadata": {
        "id": "RiEFVoRgrB6W"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "GOOGLE_CLOUD_LOCATION = 'us-central1'                # Your preferred region\n",
        "gemini_model = \"gemini-2.0-flash-001\"\n",
        "embeddings = \"text-embedding-004\""
      ],
      "metadata": {
        "id": "9qKfk0EGsC_Q"
      },
      "execution_count": 3,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "This section imports various libraries essential for handling data processing, visualization, and AI model interactions.\n",
        "\n",
        "In particular:\n",
        "\n",
        "* json: Handles JSON encoding and decoding.\n",
        "requests: Facilitates making HTTP requests, useful for fetching data from APIs.\n",
        "* BytesIO: Allows in-memory byte-stream operations, often used for handling binary data.\n",
        "* IPython.display: Imports Markdown and display to render markdown text within Jupyter notebooks.\n",
        "* sklearn.decomposition.PCA: Implements Principal Component Analysis (PCA) for dimensionality reduction in machine learning.\n",
        "* tqdm.notebook: Enables progress bars for tracking loops and processes interactively.\n",
        "\n",
        "and naturally:\n",
        "\n",
        "* genai: The main Google AI module for interacting with generative AI models.\n",
        "* google.genai.types: Imports GenerateContentConfig and Part for configuring AI model inputs and outputs."
      ],
      "metadata": {
        "id": "dEV1maVDscvY"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import json\n",
        "import requests\n",
        "from io import BytesIO\n",
        "\n",
        "import numpy as np\n",
        "from IPython.display import Markdown, display\n",
        "from sklearn.decomposition import PCA\n",
        "from tqdm.notebook import tqdm\n",
        "\n",
        "from google import genai\n",
        "from google.genai.types import GenerateContentConfig, Part"
      ],
      "metadata": {
        "id": "rYUm-gYpsIz9"
      },
      "execution_count": 4,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "The next code snippet initializes a Google Generative AI (GenAI) client using the Google Cloud Vertex AI service, stating the project and the location."
      ],
      "metadata": {
        "id": "tgnkR2P21LJ9"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "client = genai.Client(\n",
        "    vertexai=True,\n",
        "    project=GOOGLE_CLOUD_PROJECT,\n",
        "    location=GOOGLE_CLOUD_LOCATION\n",
        ")"
      ],
      "metadata": {
        "id": "y6Y0N0_AsQAj"
      },
      "execution_count": 5,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Before going to the core of the project, we need to define a few utility functions.\n",
        "\n",
        "The first function, create_part_from_url(url), dynamically creates a Part object from a given URL by determining its content type. It first sends a HEAD request to retrieve only the headers, checking the MIME type to classify the content. If the URL points to an MP4 video, image, or PDF, it constructs a Part object using from_uri(). If the content is HTML, it fetches the full page and creates a text-based Part instead. The function also includes error handling for network issues and unexpected failures, ensuring robustness when processing external URLs"
      ],
      "metadata": {
        "id": "i0e2UQTssYHJ"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def create_part_from_url(url):\n",
        "    \"\"\"Creates a Part object from a URL, handling different content types.\"\"\"\n",
        "\n",
        "    try:\n",
        "        response = requests.head(url, allow_redirects=True)  # Use HEAD to get headers only\n",
        "        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)\n",
        "        mime_type = response.headers.get(\"Content-Type\", \"\")\n",
        "\n",
        "        if \"video/mp4\" in mime_type:\n",
        "            return Part.from_uri(file_uri=url, mime_type=\"video/mp4\")\n",
        "\n",
        "        elif \"image/\" in mime_type:\n",
        "            return Part.from_uri(file_uri=url, mime_type=mime_type)\n",
        "\n",
        "        elif \"application/pdf\" in mime_type or url.endswith(\".pdf\"):\n",
        "            return Part.from_uri(file_uri=url, mime_type=\"application/pdf\")\n",
        "\n",
        "        else:\n",
        "            # For HTML, fetch the content and create a Part from the text\n",
        "            response = requests.get(url, allow_redirects=True)\n",
        "            response.raise_for_status()\n",
        "            html_content = response.text\n",
        "            return Part.from_text(text=html_content)\n",
        "\n",
        "    except requests.exceptions.RequestException as e:\n",
        "        print(f\"Error fetching URL: {e}\")\n",
        "        return None\n",
        "\n",
        "    except Exception as e:\n",
        "        print(f\"An unexpected error occured: {e}\")\n",
        "        return None"
      ],
      "metadata": {
        "id": "FypbAxv-2jVF"
      },
      "execution_count": 6,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "The extract_information(url) function retrieves and summarizes key details from a given URL using a Google Generative AI model. It first attempts to fetch the content using create_part_from_url(url). If content is available, it prompts the AI model with a structured request (PROMPT_WITH_CONTENT) to extract the title, URL, summary, and relevant tags. If no content is available, it uses a fallback prompt (PROMPT_WITHOUT_CONTENT) to make an educated guess based on the URL alone. The function then processes the AI-generated response, parses the JSON output, and extracts the required details. It includes error handling to catch unexpected failures, ensuring robustness. If extraction fails, it returns None values."
      ],
      "metadata": {
        "id": "8qy67Te81zyO"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def extract_information(url):\n",
        "    \"\"\"Extract information from a URL including summary and tags\"\"\"\n",
        "\n",
        "    # Define prompts for content extraction\n",
        "    PROMPT_WITH_CONTENT = \"\"\"Summarize the content in English and from it extract the following information:\n",
        "    title, url, summary of the relevant content (max 256 characters), tags (max 5) describing the content.\n",
        "    Return the data in JSON format: {\"title\":\"\", \"url\": \"\", \"summary\": \"\", \"tags\": []}\"\"\"\n",
        "\n",
        "    PROMPT_WITHOUT_CONTENT = \"\"\"Make an educated guess about the contents of the provided URL and from derive the following information:\n",
        "    title, url, summary of the relevant content (max 256 characters), tags (max 5) describing the content.\n",
        "    Return the data in JSON format: {\"title\":\"\", \"url\": \"\", \"summary\": \"\", \"tags\": []}\"\"\"\n",
        "\n",
        "    try:\n",
        "        # Attempt to get content from URL\n",
        "        internet_content = create_part_from_url(url)\n",
        "\n",
        "        # Generate content based on whether we have internet content or not\n",
        "        if internet_content:\n",
        "            response_data = client.models.generate_content(\n",
        "                model=gemini_model,\n",
        "                contents=[internet_content, PROMPT_WITH_CONTENT],\n",
        "                config=GenerateContentConfig(temperature=0.0)\n",
        "            )\n",
        "        else:\n",
        "            response_data = client.models.generate_content(\n",
        "                model=gemini_model,\n",
        "                contents=[url, PROMPT_WITHOUT_CONTENT],\n",
        "                config=GenerateContentConfig(temperature=0.0)\n",
        "            )\n",
        "\n",
        "        # Process the response\n",
        "        if response_data.candidates and response_data.candidates[0].content.parts:\n",
        "            text_part = response_data.candidates[0].content.parts[0].text\n",
        "\n",
        "            # Clean up JSON string by removing markdown code block markers\n",
        "            json_string = text_part.replace('```json\\n', '').replace('\\n```', '')\n",
        "\n",
        "            # Parse JSON and extract data\n",
        "            data = json.loads(json_string)\n",
        "            return data[\"title\"], data[\"url\"], data[\"summary\"], data[\"tags\"]\n",
        "\n",
        "    except Exception as e:\n",
        "        print(f\"Error extracting information from {url}: {str(e)}\")\n",
        "\n",
        "    # Return None values if any part of the process fails\n",
        "    return None, None, None, None\n",
        "\n"
      ],
      "metadata": {
        "id": "iztP8gHS-rj3"
      },
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "The generate_bookmark_markdown(bookmarks, output_file=\"bookmarks.md\") function processes a list of bookmarks and generates a Markdown file with formatted entries. Each bookmark, which includes a URL, title, summary, tags, and other data, is formatted into a structured markdown format with appropriate headings, links, and metadata. Tags are converted into a comma-separated list with hashtags, and a horizontal rule (---) is added between entries for better readability. The function writes all entries into a specified output file (default is \"bookmarks.md\") and notifies the user upon successful completion. This provides an easy-to-read, organized collection of bookmarks in markdown format."
      ],
      "metadata": {
        "id": "0fS3ncnY2GCH"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def generate_bookmark_markdown(bookmarks, output_file=\"bookmarks.md\"):\n",
        "    \"\"\"Process bookmarks and generate a markdown file with formatted entries\"\"\"\n",
        "\n",
        "    # Open the file for writing\n",
        "    with open(output_file, \"w\", encoding=\"utf-8\") as f:\n",
        "        # Write header\n",
        "        f.write(\"# Bookmarks Collection\\n\\n\")\n",
        "\n",
        "        # Process each bookmark\n",
        "        for i, bookmark in enumerate(bookmarks, 1):\n",
        "            url, extracted_url, title, summary, tags, embeds = bookmark\n",
        "\n",
        "            # Format tags as comma-separated list with hashtags\n",
        "            formatted_tags = \" \".join([f\"#{tag.strip()}\" for tag in tags]) if isinstance(tags, list) else f\"#{tags}\"\n",
        "\n",
        "            # Write bookmark entry with markdown formatting\n",
        "            f.write(f\"### {i}. [{extracted_url}]({url})\\n\\n\")\n",
        "            f.write(f\"**Title:** {title}\\n\\n\")\n",
        "            f.write(f\"**Summary:** {summary}\\n\\n\")\n",
        "            f.write(f\"**Tags:** {formatted_tags}\\n\\n\")\n",
        "\n",
        "            # Add a horizontal rule between entries (except after the last one)\n",
        "            if i < len(bookmarks):\n",
        "                f.write(\"---\\n\\n\")\n",
        "\n",
        "    print(f\"Successfully generated markdown file: {output_file}\")\n",
        "    return output_file"
      ],
      "metadata": {
        "id": "esv5W8GTsfS2"
      },
      "execution_count": 8,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Having defined the utility functions, we can now start processing a first list of URLs. In this case, the URLs are provided in a Python list, but they could also be derived from other sources. For example, they could be extracted from a text file containing URLs you have recently visited, or from a curated list of URLs that piqued your interest and that you plan to explore further in the future."
      ],
      "metadata": {
        "id": "imN2LNo72LZN"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "url_list = [\n",
        "    \"https://corporates.db.com/files/documents/publications/whitepaper-Adopting-Generative-AI-in-Banking.pdf\",\n",
        "    \"https://newsletter.victordibia.com/p/you-have-ai-fatigue-thats-why-you\",\n",
        "    \"https://arxiv.org/abs/2207.01848\",\n",
        "    \"https://arxiv.org/abs/2501.02945\",\n",
        "    \"https://www.nature.com/articles/s41586-024-08328-6\",\n",
        "    \"https://www.youtube.com/watch?v=Ilg3gGewQ5U\",\n",
        "    \"https://www.youtube.com/watch?v=wjZofJX0v4M&t=727s\",\n",
        "    \"https://unchartedterritories.tomaspueyo.com/p/why-japan-succeeds-despite-stagnation\",\n",
        "]"
      ],
      "metadata": {
        "id": "QxgHSniesrk8"
      },
      "execution_count": 9,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "The first part of our assistant processes a list of URLs and extracts key information from each: the effective URL, the page title (or an appropriate alternative), a brief summary of the content for reference, and a set of tags that can later help in clustering the topics.\n",
        "\n",
        "The code iterates through the url_list, using the extract_information() function to retrieve the title, extracted URL, summary, and tags for each URL. It then generates embeddings for the tags using the embed_content() method from the AI model. These embeddings, along with the URL, extracted URL, title, summary, and tags, are stored in the bookmarks list. This results in a collection of enriched bookmarks containing both textual information and embeddings, which can be used for tasks such as search or similarity comparison."
      ],
      "metadata": {
        "id": "vkgm7sgI2zHr"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "bookmarks =[]\n",
        "for url in tqdm(url_list):\n",
        "  if url:\n",
        "    title, extracted_url, summary, tags = extract_information(url)\n",
        "    if title:\n",
        "        embeds = client.models.embed_content(\n",
        "            model=embeddings,\n",
        "            contents=\", \".join(tags)\n",
        "            ).embeddings[0].values\n",
        "        bookmarks.append([url, extracted_url, title, summary, tags, embeds])"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 87,
          "referenced_widgets": [
            "821b1207e4d245178d8e1802304b50f3",
            "341565875e1e42f8be25ab256e8a3c6e",
            "b7b93e025f5b48d6a9a739bc1072066f",
            "43ee144b658b4b69aaf6b65dadac7834",
            "a3c64db6218e4c0aaf7164d8f054ca29",
            "a4f747901ad24063811591dd7fb5ad39",
            "4974d0ade9e3446fbf77a2e2a0ba1598",
            "bb23fa422e774336ab853a4bdafea691",
            "fc50fbcb6d474acb9cd690c30c029269",
            "60d0fd9ef4734f8dbd61100515f9fe10",
            "56c8d3f56ac2460ab36c8f99de82c80d"
          ]
        },
        "id": "faAUJj-tyLQV",
        "outputId": "11c8c392-335e-4e7a-a2ae-1698ac4d3cb1"
      },
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "  0%|          | 0/8 [00:00<?, ?it/s]"
            ],
            "application/vnd.jupyter.widget-view+json": {
              "version_major": 2,
              "version_minor": 0,
              "model_id": "821b1207e4d245178d8e1802304b50f3"
            }
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Error fetching URL: 403 Client Error: Forbidden for url: https://corporates.db.com/files/documents/publications/whitepaper-Adopting-Generative-AI-in-Banking.pdf\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Since the list of bookmarks is expected to be unordered, we aim to initially order the topics by leveraging their embeddings.\n",
        "\n",
        "The code processes the bookmarks by extracting their embeddings and applying Principal Component Analysis (PCA) to reduce the dimensionality of these embeddings. First, it collects all the embeddings into a list and converts this list into a NumPy array. The embeddings are then passed through PCA, where the data is reduced to a single component. The resulting PCA values are sorted, and the bookmarks are re-ordered based on this sorted order of embeddings. The final output is a list of sorted bookmarks, where the embeddings are ranked according to their PCA scores, likely indicating their similarity or relevance to each other.\n",
        "\n",
        "PCA identifies the principal components that capture the most variance in the data. By reducing the embeddings to a single component (the first dimension), we capture the most significant underlying structure or pattern in the data. Sorting the embeddings based on this first dimension allows us to cluster or rank the bookmarks according to the most prominent features of their content, which is particularly useful when organizing unordered or unstructured data like bookmarks. This technique helps in finding patterns and relationships that may not be immediately obvious."
      ],
      "metadata": {
        "id": "DSwnG2eP3agQ"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "embeddings_array = np.array([bookmark[5] for bookmark in bookmarks])\n",
        "\n",
        "pca = PCA(n_components=1)\n",
        "pca_result = pca.fit_transform(embeddings_array)\n",
        "\n",
        "sorted_indices = np.argsort(pca_result.flatten())\n",
        "\n",
        "sorted_bookmarks = [bookmarks[int(idx)] for idx in sorted_indices]\n"
      ],
      "metadata": {
        "id": "u1BcAyAhXg7X"
      },
      "execution_count": 11,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "To work with the processed data later, we store everything on disk as a JSON file."
      ],
      "metadata": {
        "id": "PXT0sB2L4eD6"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "with open(\"bookmarks.json\", \"w\", encoding=\"utf-8\") as f:\n",
        "    json.dump(bookmarks, f, ensure_ascii=False, indent=4)"
      ],
      "metadata": {
        "id": "hC6UNYME4aGP"
      },
      "execution_count": 12,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Now, we can generate and display a markdown document presenting all the ordered URLs and the data we extracted using Gemini 2.0 Flash."
      ],
      "metadata": {
        "id": "3D-uRNoJ4gEn"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "generate_bookmark_markdown(sorted_bookmarks)"
      ],
      "metadata": {
        "id": "PS9w9dx1PTEf",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 53
        },
        "outputId": "6afab6bf-05e8-4bbf-db2b-4919a2ef35ce"
      },
      "execution_count": 13,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Successfully generated markdown file: bookmarks.md\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "'bookmarks.md'"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            }
          },
          "metadata": {},
          "execution_count": 14
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "display(Markdown(filename='bookmarks.md'))"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "id": "eUQci-HPPqVh",
        "outputId": "14e49b3d-893a-447c-a941-4f4eef15b89f"
      },
      "execution_count": 14,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.Markdown object>"
            ],
            "text/markdown": "# Bookmarks Collection\n\n### 1. [https://unchartedterritories.tomaspueyo.com/p/why-japan-succeeds-despite-stagnation](https://unchartedterritories.tomaspueyo.com/p/why-japan-succeeds-despite-stagnation)\n\n**Title:** Why Japan Succeeds Despite Stagnation\n\n**Summary:** Despite decades of stagnation, Japan maintains a high quality of life due to affordable housing, social order, and capital. However, demographic decline and zombie firms pose challenges.\n\n**Tags:** #Japan #Economics #Demographics #Stagnation #Housing\n\n---\n\n### 2. [https://corporates.db.com/files/documents/publications/whitepaper-Adopting-Generative-AI-in-Banking.pdf](https://corporates.db.com/files/documents/publications/whitepaper-Adopting-Generative-AI-in-Banking.pdf)\n\n**Title:** Adopting Generative AI in Banking\n\n**Summary:** This Deutsche Bank whitepaper likely explores the potential applications, challenges, and strategic considerations for banks adopting generative AI technologies. It probably covers use cases, risks, and implementation strategies.\n\n**Tags:** #Generative AI #Banking #AI Adoption #Financial Services #Technology\n\n---\n\n### 3. [https://newsletter.victordibia.com/p/you-have-ai-fatigue-thats-why-you](https://newsletter.victordibia.com/p/you-have-ai-fatigue-thats-why-you)\n\n**Title:** You have 'AI Fatigue' - Thats Why You Feel Awful\n\n**Summary:** AI fatigue is the exhaustion from the unrelenting pace of AI advancement. It impacts researchers, engineers, and organizations, disrupting cycles and creating psychological strain.\n\n**Tags:** #AI #artificial intelligence #AI Fatigue #burnout #mental health\n\n---\n\n### 4. [https://www.youtube.com/watch?v=Ilg3gGewQ5U](https://www.youtube.com/watch?v=Ilg3gGewQ5U)\n\n**Title:** Backpropagation, step-by-step | DL3 - YouTube\n\n**Summary:** A visual explanation of backpropagation in neural networks, showing how they learn through iterative adjustments. Covers gradient descent and related concepts.\n\n**Tags:** #neural networks #backpropagation #deep learning #machine learning #visualization\n\n---\n\n### 5. [https://www.youtube.com/watch?v=wjZofJX0v4M](https://www.youtube.com/watch?v=wjZofJX0v4M&t=727s)\n\n**Title:** Transformers (how LLMs work) explained visually | DL5 - YouTube\n\n**Summary:** A visual explanation of how Large Language Models (LLMs) work, covering prediction, sampling, transformers, word embeddings, and more.\n\n**Tags:** #LLM #Transformers #AI #Deep Learning #Visualization\n\n---\n\n### 6. [https://arxiv.org/abs/2207.01848](https://arxiv.org/abs/2207.01848)\n\n**Title:** TabPFN: A Transformer That Solves Small Tabular Classification Problems in a Second\n\n**Summary:** TabPFN, a trained Transformer, performs supervised classification for small tabular datasets in under a second, without hyperparameter tuning, competitive with state-of-the-art methods.\n\n**Tags:** #Transformer #Tabular Data #Classification #In-Context Learning #AutoML\n\n---\n\n### 7. [https://arxiv.org/abs/2501.02945v2](https://arxiv.org/abs/2501.02945)\n\n**Title:** The Tabular Foundation Model TabPFN Outperforms Specialized Time Series Forecasting Models Based on Simple Features\n\n**Summary:** TabPFN-TS, a simple approach pairing TabPFN with feature engineering, outperforms specialized time series models, even matching larger models, while relying solely on artificial pre-training data.\n\n**Tags:** #time series #forecasting #TabPFN #foundation model #machine learning\n\n---\n\n### 8. [https://www.nature.com/articles/s41586-024-08328-6](https://www.nature.com/articles/s41586-024-08328-6)\n\n**Title:** Accurate predictions on small data with a tabular foundation model | Nature\n\n**Summary:** TabPFN, a tabular foundation model, outperforms previous methods on datasets with up to 10,000 samples. It enables fine-tuning, data generation, and density estimation, accelerating scientific discovery.\n\n**Tags:** #tabular data #foundation model #machine learning #AI #TabPFN\n\n"
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "Google Cloud credits are provided for this project.\n",
        "\n",
        "Author: Luca Massaron, AI and Kaggle GDE\n",
        "\n",
        "February 26, 2025"
      ],
      "metadata": {
        "id": "mAMFPjWN5NYE"
      }
    }
  ]
 }
No results found