Created
August 17, 2025 14:39
-
-
Save tomrockdsouza/a9dec265a9ba174015f914e621719c8f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "provenance": [] | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "DOWNLOADING AND PREPARING FILE" | |
| ], | |
| "metadata": { | |
| "id": "0ojGROaZQx-2" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "u0DW-n_c_BfO", | |
| "outputId": "d5a21e78-78e9-4126-c714-be05fc91b1f7" | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "--2025-08-17 14:37:19-- https://www.kaggle.com/api/v1/datasets/download/eldrich/hate-speech-offensive-tweets-by-davidson-et-al\n", | |
| "Resolving www.kaggle.com (www.kaggle.com)... 35.244.233.98\n", | |
| "Connecting to www.kaggle.com (www.kaggle.com)|35.244.233.98|:443... connected.\n", | |
| "HTTP request sent, awaiting response... 302 Found\n", | |
| "Location: https://storage.googleapis.com:443/kaggle-data-sets/524206/962129/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20250817%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20250817T143719Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=420c58846f9227eda4b59ef8b905ee9b08794a7ddf759f39985652f590036a50f3bcf8c16259a951f6e217c7b126efb15c2de54a5fc5316702a37f3c7c60a2f8df77792d8c119fc81aab0746ce2fe1d5ef1e2ec5ff090b101009b13fcc2fb026a077097f9cf238a3148f798bf8df40452726ebab173a91bad4562b8cd7d2f028b003b277f71c38467eb976827c2894bd68bbfa0de2cc9d15111888e16a32a57e7b0ea916f992232ab6481e38d68006d02eb069e752a5038820e77f52e7662bb1afe340b5aad9e0c49b9d20d570506b89e5e5fb5b02c3d329bb6e3f331bf35b7d661d3cc763888525fb9ada39016a500589b1eaa6a2158911fedeb3fbbdf24236 [following]\n", | |
| "--2025-08-17 14:37:19-- https://storage.googleapis.com/kaggle-data-sets/524206/962129/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20250817%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20250817T143719Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=420c58846f9227eda4b59ef8b905ee9b08794a7ddf759f39985652f590036a50f3bcf8c16259a951f6e217c7b126efb15c2de54a5fc5316702a37f3c7c60a2f8df77792d8c119fc81aab0746ce2fe1d5ef1e2ec5ff090b101009b13fcc2fb026a077097f9cf238a3148f798bf8df40452726ebab173a91bad4562b8cd7d2f028b003b277f71c38467eb976827c2894bd68bbfa0de2cc9d15111888e16a32a57e7b0ea916f992232ab6481e38d68006d02eb069e752a5038820e77f52e7662bb1afe340b5aad9e0c49b9d20d570506b89e5e5fb5b02c3d329bb6e3f331bf35b7d661d3cc763888525fb9ada39016a500589b1eaa6a2158911fedeb3fbbdf24236\n", | |
| "Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.139.207, 173.194.210.207, 173.194.212.207, ...\n", | |
| "Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.139.207|:443... connected.\n", | |
| "HTTP request sent, awaiting response... 200 OK\n", | |
| "Length: 2198119 (2.1M) [application/zip]\n", | |
| "Saving to: ‘archive.zip’\n", | |
| "\n", | |
| "archive.zip 100%[===================>] 2.10M --.-KB/s in 0.02s \n", | |
| "\n", | |
| "2025-08-17 14:37:19 (95.3 MB/s) - ‘archive.zip’ saved [2198119/2198119]\n", | |
| "\n", | |
| "Archive: archive.zip\n", | |
| " inflating: data/labeled_data.csv \n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!wget -O archive.zip https://www.kaggle.com/api/v1/datasets/download/eldrich/hate-speech-offensive-tweets-by-davidson-et-al\n", | |
| "!unzip -o archive.zip data/labeled_data.csv\n", | |
| "!mv data/labeled_data.csv labeled_data.csv\n", | |
| "!rm archive.zip\n", | |
| "!rm -rf data" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "IMPORTING LIBRARIES" | |
| ], | |
| "metadata": { | |
| "id": "4CLWZPbzQtcm" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "!pip install nltk\n", | |
| "import pandas as pd\n", | |
| "import numpy as np\n", | |
| "from datetime import datetime, timedelta\n", | |
| "import random\n", | |
| "import re\n", | |
| "import nltk\n", | |
| "from nltk.corpus import stopwords\n", | |
| "from nltk.stem import WordNetLemmatizer\n", | |
| "\n", | |
| "nltk.download('punkt_tab')\n", | |
| "nltk.download('stopwords')\n", | |
| "nltk.download('wordnet')\n", | |
| "\n" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "ZbOZeA5_BZCi", | |
| "outputId": "a29eb404-0157-4d40-ffeb-f4bd59e5fdd8" | |
| }, | |
| "execution_count": 2, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Requirement already satisfied: nltk in /usr/local/lib/python3.11/dist-packages (3.9.1)\n", | |
| "Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from nltk) (8.2.1)\n", | |
| "Requirement already satisfied: joblib in /usr/local/lib/python3.11/dist-packages (from nltk) (1.5.1)\n", | |
| "Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.11/dist-packages (from nltk) (2024.11.6)\n", | |
| "Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from nltk) (4.67.1)\n" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "name": "stderr", | |
| "text": [ | |
| "[nltk_data] Downloading package punkt_tab to /root/nltk_data...\n", | |
| "[nltk_data] Unzipping tokenizers/punkt_tab.zip.\n", | |
| "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", | |
| "[nltk_data] Unzipping corpora/stopwords.zip.\n", | |
| "[nltk_data] Downloading package wordnet to /root/nltk_data...\n" | |
| ] | |
| }, | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "execution_count": 2 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "USING REQUIRED COLUMNS" | |
| ], | |
| "metadata": { | |
| "id": "oKhoXmAiQHOl" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "df=pd.read_csv('labeled_data.csv')\n", | |
| "df[\"label\"]=df[\"class\"].apply(lambda x: 1 if x==0 else 0)\n", | |
| "df=df[['tweet','label']]\n", | |
| "df.rename(columns={\"tweet\":\"comment_text\"},inplace=True)" | |
| ], | |
| "metadata": { | |
| "id": "vDe5BHZeGYRD" | |
| }, | |
| "execution_count": 3, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "GENERATING SYNTHETIC DATA" | |
| ], | |
| "metadata": { | |
| "id": "4Wvf1ptkP6a3" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "cities = [\"New York, USA\", \"London, UK\", \"Berlin, Germany\", \"Tokyo, Japan\", \"Belgrade, Serbia\"]\n", | |
| "df[\"user_age\"] = np.random.randint(13, 70, size=len(df))\n", | |
| "df[\"user_location\"] = np.random.choice(cities, size=len(df))\n", | |
| "start_date = datetime(2020, 1, 1)\n", | |
| "df[\"date_posted\"] = [ (start_date + timedelta(days=random.randint(0, 1000))).date()\n", | |
| " for _ in range(len(df)) ]" | |
| ], | |
| "metadata": { | |
| "id": "EGgCy1ZRIFL7" | |
| }, | |
| "execution_count": 4, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "CLEANING DATA" | |
| ], | |
| "metadata": { | |
| "id": "zWzybwMqP3U_" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "stop_words = set(stopwords.words('english'))\n", | |
| "lemmatizer = WordNetLemmatizer()\n", | |
| "\n", | |
| "def clean_and_lemmatize(text):\n", | |
| " # Remove URLs and non-alphabetic characters\n", | |
| " text = re.sub(r\"http\\S+|[^a-zA-Z\\s]\", \" \", text.lower())\n", | |
| " tokens = nltk.word_tokenize(text)\n", | |
| " # Remove stopwords and short tokens, then lemmatize\n", | |
| " tokens = [lemmatizer.lemmatize(tok) for tok in tokens if tok not in stop_words and len(tok) > 1]\n", | |
| " return \" \".join(tokens)\n", | |
| "\n", | |
| "df[\"comment_text\"] = df[\"comment_text\"].apply(clean_and_lemmatize)\n", | |
| "df.head()" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 206 | |
| }, | |
| "id": "wJWzs7XJIZ7-", | |
| "outputId": "e8285f29-027e-4a96-c3a2-d547f9402e1b" | |
| }, | |
| "execution_count": 5, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| " comment_text label user_age \\\n", | |
| "0 rt mayasolovely woman complain cleaning house ... 0 29 \n", | |
| "1 rt mleew boy dat cold tyga dwn bad cuffin dat ... 0 25 \n", | |
| "2 rt urkindofbrand dawg rt sbaby life ever fuck ... 0 54 \n", | |
| "3 rt anderson viva based look like tranny 0 46 \n", | |
| "4 rt shenikaroberts shit hear might true might f... 0 66 \n", | |
| "\n", | |
| " user_location date_posted \n", | |
| "0 New York, USA 2022-01-08 \n", | |
| "1 Tokyo, Japan 2020-07-05 \n", | |
| "2 London, UK 2020-02-14 \n", | |
| "3 Tokyo, Japan 2020-02-19 \n", | |
| "4 Tokyo, Japan 2021-12-28 " | |
| ], | |
| "text/html": [ | |
| "\n", | |
| " <div id=\"df-a3727082-8354-4eaa-b297-effc64af5488\" class=\"colab-df-container\">\n", | |
| " <div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>comment_text</th>\n", | |
| " <th>label</th>\n", | |
| " <th>user_age</th>\n", | |
| " <th>user_location</th>\n", | |
| " <th>date_posted</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>rt mayasolovely woman complain cleaning house ...</td>\n", | |
| " <td>0</td>\n", | |
| " <td>29</td>\n", | |
| " <td>New York, USA</td>\n", | |
| " <td>2022-01-08</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>rt mleew boy dat cold tyga dwn bad cuffin dat ...</td>\n", | |
| " <td>0</td>\n", | |
| " <td>25</td>\n", | |
| " <td>Tokyo, Japan</td>\n", | |
| " <td>2020-07-05</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>rt urkindofbrand dawg rt sbaby life ever fuck ...</td>\n", | |
| " <td>0</td>\n", | |
| " <td>54</td>\n", | |
| " <td>London, UK</td>\n", | |
| " <td>2020-02-14</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>rt anderson viva based look like tranny</td>\n", | |
| " <td>0</td>\n", | |
| " <td>46</td>\n", | |
| " <td>Tokyo, Japan</td>\n", | |
| " <td>2020-02-19</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>rt shenikaroberts shit hear might true might f...</td>\n", | |
| " <td>0</td>\n", | |
| " <td>66</td>\n", | |
| " <td>Tokyo, Japan</td>\n", | |
| " <td>2021-12-28</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>\n", | |
| " <div class=\"colab-df-buttons\">\n", | |
| "\n", | |
| " <div class=\"colab-df-container\">\n", | |
| " <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-a3727082-8354-4eaa-b297-effc64af5488')\"\n", | |
| " title=\"Convert this dataframe to an interactive table.\"\n", | |
| " style=\"display:none;\">\n", | |
| "\n", | |
| " <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n", | |
| " <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n", | |
| " </svg>\n", | |
| " </button>\n", | |
| "\n", | |
| " <style>\n", | |
| " .colab-df-container {\n", | |
| " display:flex;\n", | |
| " gap: 12px;\n", | |
| " }\n", | |
| "\n", | |
| " .colab-df-convert {\n", | |
| " background-color: #E8F0FE;\n", | |
| " border: none;\n", | |
| " border-radius: 50%;\n", | |
| " cursor: pointer;\n", | |
| " display: none;\n", | |
| " fill: #1967D2;\n", | |
| " height: 32px;\n", | |
| " padding: 0 0 0 0;\n", | |
| " width: 32px;\n", | |
| " }\n", | |
| "\n", | |
| " .colab-df-convert:hover {\n", | |
| " background-color: #E2EBFA;\n", | |
| " box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
| " fill: #174EA6;\n", | |
| " }\n", | |
| "\n", | |
| " .colab-df-buttons div {\n", | |
| " margin-bottom: 4px;\n", | |
| " }\n", | |
| "\n", | |
| " [theme=dark] .colab-df-convert {\n", | |
| " background-color: #3B4455;\n", | |
| " fill: #D2E3FC;\n", | |
| " }\n", | |
| "\n", | |
| " [theme=dark] .colab-df-convert:hover {\n", | |
| " background-color: #434B5C;\n", | |
| " box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n", | |
| " filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n", | |
| " fill: #FFFFFF;\n", | |
| " }\n", | |
| " </style>\n", | |
| "\n", | |
| " <script>\n", | |
| " const buttonEl =\n", | |
| " document.querySelector('#df-a3727082-8354-4eaa-b297-effc64af5488 button.colab-df-convert');\n", | |
| " buttonEl.style.display =\n", | |
| " google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
| "\n", | |
| " async function convertToInteractive(key) {\n", | |
| " const element = document.querySelector('#df-a3727082-8354-4eaa-b297-effc64af5488');\n", | |
| " const dataTable =\n", | |
| " await google.colab.kernel.invokeFunction('convertToInteractive',\n", | |
| " [key], {});\n", | |
| " if (!dataTable) return;\n", | |
| "\n", | |
| " const docLinkHtml = 'Like what you see? Visit the ' +\n", | |
| " '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n", | |
| " + ' to learn more about interactive tables.';\n", | |
| " element.innerHTML = '';\n", | |
| " dataTable['output_type'] = 'display_data';\n", | |
| " await google.colab.output.renderOutput(dataTable, element);\n", | |
| " const docLink = document.createElement('div');\n", | |
| " docLink.innerHTML = docLinkHtml;\n", | |
| " element.appendChild(docLink);\n", | |
| " }\n", | |
| " </script>\n", | |
| " </div>\n", | |
| "\n", | |
| "\n", | |
| " <div id=\"df-1f76e7b2-0b34-41ab-b2b4-2b98804c30da\">\n", | |
| " <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-1f76e7b2-0b34-41ab-b2b4-2b98804c30da')\"\n", | |
| " title=\"Suggest charts\"\n", | |
| " style=\"display:none;\">\n", | |
| "\n", | |
| "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n", | |
| " width=\"24px\">\n", | |
| " <g>\n", | |
| " <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n", | |
| " </g>\n", | |
| "</svg>\n", | |
| " </button>\n", | |
| "\n", | |
| "<style>\n", | |
| " .colab-df-quickchart {\n", | |
| " --bg-color: #E8F0FE;\n", | |
| " --fill-color: #1967D2;\n", | |
| " --hover-bg-color: #E2EBFA;\n", | |
| " --hover-fill-color: #174EA6;\n", | |
| " --disabled-fill-color: #AAA;\n", | |
| " --disabled-bg-color: #DDD;\n", | |
| " }\n", | |
| "\n", | |
| " [theme=dark] .colab-df-quickchart {\n", | |
| " --bg-color: #3B4455;\n", | |
| " --fill-color: #D2E3FC;\n", | |
| " --hover-bg-color: #434B5C;\n", | |
| " --hover-fill-color: #FFFFFF;\n", | |
| " --disabled-bg-color: #3B4455;\n", | |
| " --disabled-fill-color: #666;\n", | |
| " }\n", | |
| "\n", | |
| " .colab-df-quickchart {\n", | |
| " background-color: var(--bg-color);\n", | |
| " border: none;\n", | |
| " border-radius: 50%;\n", | |
| " cursor: pointer;\n", | |
| " display: none;\n", | |
| " fill: var(--fill-color);\n", | |
| " height: 32px;\n", | |
| " padding: 0;\n", | |
| " width: 32px;\n", | |
| " }\n", | |
| "\n", | |
| " .colab-df-quickchart:hover {\n", | |
| " background-color: var(--hover-bg-color);\n", | |
| " box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n", | |
| " fill: var(--button-hover-fill-color);\n", | |
| " }\n", | |
| "\n", | |
| " .colab-df-quickchart-complete:disabled,\n", | |
| " .colab-df-quickchart-complete:disabled:hover {\n", | |
| " background-color: var(--disabled-bg-color);\n", | |
| " fill: var(--disabled-fill-color);\n", | |
| " box-shadow: none;\n", | |
| " }\n", | |
| "\n", | |
| " .colab-df-spinner {\n", | |
| " border: 2px solid var(--fill-color);\n", | |
| " border-color: transparent;\n", | |
| " border-bottom-color: var(--fill-color);\n", | |
| " animation:\n", | |
| " spin 1s steps(1) infinite;\n", | |
| " }\n", | |
| "\n", | |
| " @keyframes spin {\n", | |
| " 0% {\n", | |
| " border-color: transparent;\n", | |
| " border-bottom-color: var(--fill-color);\n", | |
| " border-left-color: var(--fill-color);\n", | |
| " }\n", | |
| " 20% {\n", | |
| " border-color: transparent;\n", | |
| " border-left-color: var(--fill-color);\n", | |
| " border-top-color: var(--fill-color);\n", | |
| " }\n", | |
| " 30% {\n", | |
| " border-color: transparent;\n", | |
| " border-left-color: var(--fill-color);\n", | |
| " border-top-color: var(--fill-color);\n", | |
| " border-right-color: var(--fill-color);\n", | |
| " }\n", | |
| " 40% {\n", | |
| " border-color: transparent;\n", | |
| " border-right-color: var(--fill-color);\n", | |
| " border-top-color: var(--fill-color);\n", | |
| " }\n", | |
| " 60% {\n", | |
| " border-color: transparent;\n", | |
| " border-right-color: var(--fill-color);\n", | |
| " }\n", | |
| " 80% {\n", | |
| " border-color: transparent;\n", | |
| " border-right-color: var(--fill-color);\n", | |
| " border-bottom-color: var(--fill-color);\n", | |
| " }\n", | |
| " 90% {\n", | |
| " border-color: transparent;\n", | |
| " border-bottom-color: var(--fill-color);\n", | |
| " }\n", | |
| " }\n", | |
| "</style>\n", | |
| "\n", | |
| " <script>\n", | |
| " async function quickchart(key) {\n", | |
| " const quickchartButtonEl =\n", | |
| " document.querySelector('#' + key + ' button');\n", | |
| " quickchartButtonEl.disabled = true; // To prevent multiple clicks.\n", | |
| " quickchartButtonEl.classList.add('colab-df-spinner');\n", | |
| " try {\n", | |
| " const charts = await google.colab.kernel.invokeFunction(\n", | |
| " 'suggestCharts', [key], {});\n", | |
| " } catch (error) {\n", | |
| " console.error('Error during call to suggestCharts:', error);\n", | |
| " }\n", | |
| " quickchartButtonEl.classList.remove('colab-df-spinner');\n", | |
| " quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n", | |
| " }\n", | |
| " (() => {\n", | |
| " let quickchartButtonEl =\n", | |
| " document.querySelector('#df-1f76e7b2-0b34-41ab-b2b4-2b98804c30da button');\n", | |
| " quickchartButtonEl.style.display =\n", | |
| " google.colab.kernel.accessAllowed ? 'block' : 'none';\n", | |
| " })();\n", | |
| " </script>\n", | |
| " </div>\n", | |
| "\n", | |
| " </div>\n", | |
| " </div>\n" | |
| ], | |
| "application/vnd.google.colaboratory.intrinsic+json": { | |
| "type": "dataframe", | |
| "variable_name": "df", | |
| "summary": "{\n \"name\": \"df\",\n \"rows\": 24783,\n \"fields\": [\n {\n \"column\": \"comment_text\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 24543,\n \"samples\": [\n \"rt letmelive stevie playa yr tho lhhatlreunion player stand solid behind bullshit bitch respect\",\n \"rt kevin mcadams stop little bitch fuckherrightinthepussy\",\n \"arionnee smh iont understand fuck hoe\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"label\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0,\n \"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n \"samples\": [\n 1,\n 0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"user_age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 16,\n \"min\": 13,\n \"max\": 69,\n \"num_unique_values\": 57,\n \"samples\": [\n 29,\n 28\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"user_location\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Tokyo, Japan\",\n \"Belgrade, Serbia\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"date_posted\",\n \"properties\": {\n \"dtype\": \"date\",\n \"min\": \"2020-01-01\",\n \"max\": \"2022-09-27\",\n \"num_unique_values\": 1001,\n \"samples\": [\n \"2021-03-09\",\n \"2020-04-15\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" | |
| } | |
| }, | |
| "metadata": {}, | |
| "execution_count": 5 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "COMMENT TEXT VECTORIZATION" | |
| ], | |
| "metadata": { | |
| "id": "1N5gW5E9QNpO" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "from sklearn.feature_extraction.text import TfidfVectorizer\n", | |
| "\n", | |
| "vectorizer = TfidfVectorizer(max_features=1000)\n", | |
| "X_tfidf = vectorizer.fit_transform(df[\"comment_text\"])\n", | |
| "print(f\"TF-IDF matrix shape: {X_tfidf.shape}\")" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "XQ_NVm5WJz6q", | |
| "outputId": "ec7f8515-327f-40fc-df31-2390ff1e0ecb" | |
| }, | |
| "execution_count": 6, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "TF-IDF matrix shape: (24783, 1000)\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "TRAINING MODEL" | |
| ], | |
| "metadata": { | |
| "id": "BslDaaZoQnyn" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "from sklearn.preprocessing import OneHotEncoder\n", | |
| "from sklearn.compose import ColumnTransformer\n", | |
| "from sklearn.preprocessing import StandardScaler\n", | |
| "import scipy\n", | |
| "\n", | |
| "\n", | |
| "#------------------CHECK--START\n", | |
| "df['date_posted'] = pd.to_datetime(df['date_posted'])\n", | |
| "\n", | |
| "# Convert 'date_posted' to numerical feature (e.g., number of days since a reference date)\n", | |
| "reference_date = datetime(2020, 1, 1) # Reference date can be a datetime object\n", | |
| "df['date_posted_numeric'] = (df['date_posted'] - reference_date).dt.days\n", | |
| "\n", | |
| "# Define which columns to transform\n", | |
| "categorical_features = ['user_location']\n", | |
| "numerical_features = ['user_age', 'date_posted_numeric']\n", | |
| "\n", | |
| "# Create transformers for each column type\n", | |
| "preprocessor = ColumnTransformer(\n", | |
| " transformers=[\n", | |
| " ('num', StandardScaler(), numerical_features),\n", | |
| " ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)\n", | |
| " ],\n", | |
| " remainder='passthrough' # Keep other columns (like comment_text)\n", | |
| ")\n", | |
| "\n", | |
| "# Fit and transform the specified columns\n", | |
| "X_engineered_features = preprocessor.fit_transform(df[['user_age', 'user_location', 'date_posted_numeric']])\n", | |
| "#------------------CHECK--END\n", | |
| "\n", | |
| "# Combine TF-IDF with engineered features\n", | |
| "# X = scipy.sparse.hstack([X_tfidf]) #------------------CHECK--COMMENT\n", | |
| "X = scipy.sparse.hstack([X_tfidf, X_engineered_features])\n", | |
| "y = df[\"label\"]\n", | |
| "\n", | |
| "from sklearn.model_selection import train_test_split\n", | |
| "from sklearn.ensemble import RandomForestClassifier\n", | |
| "\n", | |
| "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)\n", | |
| "clf = RandomForestClassifier(n_estimators=100, random_state=42)\n", | |
| "clf.fit(X_train, y_train)" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 80 | |
| }, | |
| "id": "Vv1cvjQ7J-Dv", | |
| "outputId": "809481ac-bfe6-48c4-a010-22ff83c00505" | |
| }, | |
| "execution_count": 7, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "RandomForestClassifier(random_state=42)" | |
| ], | |
| "text/html": [ | |
| "<style>#sk-container-id-1 {\n", | |
| " /* Definition of color scheme common for light and dark mode */\n", | |
| " --sklearn-color-text: #000;\n", | |
| " --sklearn-color-text-muted: #666;\n", | |
| " --sklearn-color-line: gray;\n", | |
| " /* Definition of color scheme for unfitted estimators */\n", | |
| " --sklearn-color-unfitted-level-0: #fff5e6;\n", | |
| " --sklearn-color-unfitted-level-1: #f6e4d2;\n", | |
| " --sklearn-color-unfitted-level-2: #ffe0b3;\n", | |
| " --sklearn-color-unfitted-level-3: chocolate;\n", | |
| " /* Definition of color scheme for fitted estimators */\n", | |
| " --sklearn-color-fitted-level-0: #f0f8ff;\n", | |
| " --sklearn-color-fitted-level-1: #d4ebff;\n", | |
| " --sklearn-color-fitted-level-2: #b3dbfd;\n", | |
| " --sklearn-color-fitted-level-3: cornflowerblue;\n", | |
| "\n", | |
| " /* Specific color for light theme */\n", | |
| " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n", | |
| " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n", | |
| " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n", | |
| " --sklearn-color-icon: #696969;\n", | |
| "\n", | |
| " @media (prefers-color-scheme: dark) {\n", | |
| " /* Redefinition of color scheme for dark theme */\n", | |
| " --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n", | |
| " --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n", | |
| " --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n", | |
| " --sklearn-color-icon: #878787;\n", | |
| " }\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 {\n", | |
| " color: var(--sklearn-color-text);\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 pre {\n", | |
| " padding: 0;\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 input.sk-hidden--visually {\n", | |
| " border: 0;\n", | |
| " clip: rect(1px 1px 1px 1px);\n", | |
| " clip: rect(1px, 1px, 1px, 1px);\n", | |
| " height: 1px;\n", | |
| " margin: -1px;\n", | |
| " overflow: hidden;\n", | |
| " padding: 0;\n", | |
| " position: absolute;\n", | |
| " width: 1px;\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 div.sk-dashed-wrapped {\n", | |
| " border: 1px dashed var(--sklearn-color-line);\n", | |
| " margin: 0 0.4em 0.5em 0.4em;\n", | |
| " box-sizing: border-box;\n", | |
| " padding-bottom: 0.4em;\n", | |
| " background-color: var(--sklearn-color-background);\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 div.sk-container {\n", | |
| " /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n", | |
| " but bootstrap.min.css set `[hidden] { display: none !important; }`\n", | |
| " so we also need the `!important` here to be able to override the\n", | |
| " default hidden behavior on the sphinx rendered scikit-learn.org.\n", | |
| " See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n", | |
| " display: inline-block !important;\n", | |
| " position: relative;\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 div.sk-text-repr-fallback {\n", | |
| " display: none;\n", | |
| "}\n", | |
| "\n", | |
| "div.sk-parallel-item,\n", | |
| "div.sk-serial,\n", | |
| "div.sk-item {\n", | |
| " /* draw centered vertical line to link estimators */\n", | |
| " background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n", | |
| " background-size: 2px 100%;\n", | |
| " background-repeat: no-repeat;\n", | |
| " background-position: center center;\n", | |
| "}\n", | |
| "\n", | |
| "/* Parallel-specific style estimator block */\n", | |
| "\n", | |
| "#sk-container-id-1 div.sk-parallel-item::after {\n", | |
| " content: \"\";\n", | |
| " width: 100%;\n", | |
| " border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n", | |
| " flex-grow: 1;\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 div.sk-parallel {\n", | |
| " display: flex;\n", | |
| " align-items: stretch;\n", | |
| " justify-content: center;\n", | |
| " background-color: var(--sklearn-color-background);\n", | |
| " position: relative;\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 div.sk-parallel-item {\n", | |
| " display: flex;\n", | |
| " flex-direction: column;\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 div.sk-parallel-item:first-child::after {\n", | |
| " align-self: flex-end;\n", | |
| " width: 50%;\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 div.sk-parallel-item:last-child::after {\n", | |
| " align-self: flex-start;\n", | |
| " width: 50%;\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 div.sk-parallel-item:only-child::after {\n", | |
| " width: 0;\n", | |
| "}\n", | |
| "\n", | |
| "/* Serial-specific style estimator block */\n", | |
| "\n", | |
| "#sk-container-id-1 div.sk-serial {\n", | |
| " display: flex;\n", | |
| " flex-direction: column;\n", | |
| " align-items: center;\n", | |
| " background-color: var(--sklearn-color-background);\n", | |
| " padding-right: 1em;\n", | |
| " padding-left: 1em;\n", | |
| "}\n", | |
| "\n", | |
| "\n", | |
| "/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n", | |
| "clickable and can be expanded/collapsed.\n", | |
| "- Pipeline and ColumnTransformer use this feature and define the default style\n", | |
| "- Estimators will overwrite some part of the style using the `sk-estimator` class\n", | |
| "*/\n", | |
| "\n", | |
| "/* Pipeline and ColumnTransformer style (default) */\n", | |
| "\n", | |
| "#sk-container-id-1 div.sk-toggleable {\n", | |
| " /* Default theme specific background. It is overwritten whether we have a\n", | |
| " specific estimator or a Pipeline/ColumnTransformer */\n", | |
| " background-color: var(--sklearn-color-background);\n", | |
| "}\n", | |
| "\n", | |
| "/* Toggleable label */\n", | |
| "#sk-container-id-1 label.sk-toggleable__label {\n", | |
| " cursor: pointer;\n", | |
| " display: flex;\n", | |
| " width: 100%;\n", | |
| " margin-bottom: 0;\n", | |
| " padding: 0.5em;\n", | |
| " box-sizing: border-box;\n", | |
| " text-align: center;\n", | |
| " align-items: start;\n", | |
| " justify-content: space-between;\n", | |
| " gap: 0.5em;\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 label.sk-toggleable__label .caption {\n", | |
| " font-size: 0.6rem;\n", | |
| " font-weight: lighter;\n", | |
| " color: var(--sklearn-color-text-muted);\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 label.sk-toggleable__label-arrow:before {\n", | |
| " /* Arrow on the left of the label */\n", | |
| " content: \"▸\";\n", | |
| " float: left;\n", | |
| " margin-right: 0.25em;\n", | |
| " color: var(--sklearn-color-icon);\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {\n", | |
| " color: var(--sklearn-color-text);\n", | |
| "}\n", | |
| "\n", | |
| "/* Toggleable content - dropdown */\n", | |
| "\n", | |
| "#sk-container-id-1 div.sk-toggleable__content {\n", | |
| " max-height: 0;\n", | |
| " max-width: 0;\n", | |
| " overflow: hidden;\n", | |
| " text-align: left;\n", | |
| " /* unfitted */\n", | |
| " background-color: var(--sklearn-color-unfitted-level-0);\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 div.sk-toggleable__content.fitted {\n", | |
| " /* fitted */\n", | |
| " background-color: var(--sklearn-color-fitted-level-0);\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 div.sk-toggleable__content pre {\n", | |
| " margin: 0.2em;\n", | |
| " border-radius: 0.25em;\n", | |
| " color: var(--sklearn-color-text);\n", | |
| " /* unfitted */\n", | |
| " background-color: var(--sklearn-color-unfitted-level-0);\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 div.sk-toggleable__content.fitted pre {\n", | |
| " /* unfitted */\n", | |
| " background-color: var(--sklearn-color-fitted-level-0);\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n", | |
| " /* Expand drop-down */\n", | |
| " max-height: 200px;\n", | |
| " max-width: 100%;\n", | |
| " overflow: auto;\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n", | |
| " content: \"▾\";\n", | |
| "}\n", | |
| "\n", | |
| "/* Pipeline/ColumnTransformer-specific style */\n", | |
| "\n", | |
| "#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n", | |
| " color: var(--sklearn-color-text);\n", | |
| " background-color: var(--sklearn-color-unfitted-level-2);\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n", | |
| " background-color: var(--sklearn-color-fitted-level-2);\n", | |
| "}\n", | |
| "\n", | |
| "/* Estimator-specific style */\n", | |
| "\n", | |
| "/* Colorize estimator box */\n", | |
| "#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n", | |
| " /* unfitted */\n", | |
| " background-color: var(--sklearn-color-unfitted-level-2);\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n", | |
| " /* fitted */\n", | |
| " background-color: var(--sklearn-color-fitted-level-2);\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 div.sk-label label.sk-toggleable__label,\n", | |
| "#sk-container-id-1 div.sk-label label {\n", | |
| " /* The background is the default theme color */\n", | |
| " color: var(--sklearn-color-text-on-default-background);\n", | |
| "}\n", | |
| "\n", | |
| "/* On hover, darken the color of the background */\n", | |
| "#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {\n", | |
| " color: var(--sklearn-color-text);\n", | |
| " background-color: var(--sklearn-color-unfitted-level-2);\n", | |
| "}\n", | |
| "\n", | |
| "/* Label box, darken color on hover, fitted */\n", | |
| "#sk-container-id-1 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n", | |
| " color: var(--sklearn-color-text);\n", | |
| " background-color: var(--sklearn-color-fitted-level-2);\n", | |
| "}\n", | |
| "\n", | |
| "/* Estimator label */\n", | |
| "\n", | |
| "#sk-container-id-1 div.sk-label label {\n", | |
| " font-family: monospace;\n", | |
| " font-weight: bold;\n", | |
| " display: inline-block;\n", | |
| " line-height: 1.2em;\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 div.sk-label-container {\n", | |
| " text-align: center;\n", | |
| "}\n", | |
| "\n", | |
| "/* Estimator-specific */\n", | |
| "#sk-container-id-1 div.sk-estimator {\n", | |
| " font-family: monospace;\n", | |
| " border: 1px dotted var(--sklearn-color-border-box);\n", | |
| " border-radius: 0.25em;\n", | |
| " box-sizing: border-box;\n", | |
| " margin-bottom: 0.5em;\n", | |
| " /* unfitted */\n", | |
| " background-color: var(--sklearn-color-unfitted-level-0);\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 div.sk-estimator.fitted {\n", | |
| " /* fitted */\n", | |
| " background-color: var(--sklearn-color-fitted-level-0);\n", | |
| "}\n", | |
| "\n", | |
| "/* on hover */\n", | |
| "#sk-container-id-1 div.sk-estimator:hover {\n", | |
| " /* unfitted */\n", | |
| " background-color: var(--sklearn-color-unfitted-level-2);\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 div.sk-estimator.fitted:hover {\n", | |
| " /* fitted */\n", | |
| " background-color: var(--sklearn-color-fitted-level-2);\n", | |
| "}\n", | |
| "\n", | |
| "/* Specification for estimator info (e.g. \"i\" and \"?\") */\n", | |
| "\n", | |
| "/* Common style for \"i\" and \"?\" */\n", | |
| "\n", | |
| ".sk-estimator-doc-link,\n", | |
| "a:link.sk-estimator-doc-link,\n", | |
| "a:visited.sk-estimator-doc-link {\n", | |
| " float: right;\n", | |
| " font-size: smaller;\n", | |
| " line-height: 1em;\n", | |
| " font-family: monospace;\n", | |
| " background-color: var(--sklearn-color-background);\n", | |
| " border-radius: 1em;\n", | |
| " height: 1em;\n", | |
| " width: 1em;\n", | |
| " text-decoration: none !important;\n", | |
| " margin-left: 0.5em;\n", | |
| " text-align: center;\n", | |
| " /* unfitted */\n", | |
| " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n", | |
| " color: var(--sklearn-color-unfitted-level-1);\n", | |
| "}\n", | |
| "\n", | |
| ".sk-estimator-doc-link.fitted,\n", | |
| "a:link.sk-estimator-doc-link.fitted,\n", | |
| "a:visited.sk-estimator-doc-link.fitted {\n", | |
| " /* fitted */\n", | |
| " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n", | |
| " color: var(--sklearn-color-fitted-level-1);\n", | |
| "}\n", | |
| "\n", | |
| "/* On hover */\n", | |
| "div.sk-estimator:hover .sk-estimator-doc-link:hover,\n", | |
| ".sk-estimator-doc-link:hover,\n", | |
| "div.sk-label-container:hover .sk-estimator-doc-link:hover,\n", | |
| ".sk-estimator-doc-link:hover {\n", | |
| " /* unfitted */\n", | |
| " background-color: var(--sklearn-color-unfitted-level-3);\n", | |
| " color: var(--sklearn-color-background);\n", | |
| " text-decoration: none;\n", | |
| "}\n", | |
| "\n", | |
| "div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n", | |
| ".sk-estimator-doc-link.fitted:hover,\n", | |
| "div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n", | |
| ".sk-estimator-doc-link.fitted:hover {\n", | |
| " /* fitted */\n", | |
| " background-color: var(--sklearn-color-fitted-level-3);\n", | |
| " color: var(--sklearn-color-background);\n", | |
| " text-decoration: none;\n", | |
| "}\n", | |
| "\n", | |
| "/* Span, style for the box shown on hovering the info icon */\n", | |
| ".sk-estimator-doc-link span {\n", | |
| " display: none;\n", | |
| " z-index: 9999;\n", | |
| " position: relative;\n", | |
| " font-weight: normal;\n", | |
| " right: .2ex;\n", | |
| " padding: .5ex;\n", | |
| " margin: .5ex;\n", | |
| " width: min-content;\n", | |
| " min-width: 20ex;\n", | |
| " max-width: 50ex;\n", | |
| " color: var(--sklearn-color-text);\n", | |
| " box-shadow: 2pt 2pt 4pt #999;\n", | |
| " /* unfitted */\n", | |
| " background: var(--sklearn-color-unfitted-level-0);\n", | |
| " border: .5pt solid var(--sklearn-color-unfitted-level-3);\n", | |
| "}\n", | |
| "\n", | |
| ".sk-estimator-doc-link.fitted span {\n", | |
| " /* fitted */\n", | |
| " background: var(--sklearn-color-fitted-level-0);\n", | |
| " border: var(--sklearn-color-fitted-level-3);\n", | |
| "}\n", | |
| "\n", | |
| ".sk-estimator-doc-link:hover span {\n", | |
| " display: block;\n", | |
| "}\n", | |
| "\n", | |
| "/* \"?\"-specific style due to the `<a>` HTML tag */\n", | |
| "\n", | |
| "#sk-container-id-1 a.estimator_doc_link {\n", | |
| " float: right;\n", | |
| " font-size: 1rem;\n", | |
| " line-height: 1em;\n", | |
| " font-family: monospace;\n", | |
| " background-color: var(--sklearn-color-background);\n", | |
| " border-radius: 1rem;\n", | |
| " height: 1rem;\n", | |
| " width: 1rem;\n", | |
| " text-decoration: none;\n", | |
| " /* unfitted */\n", | |
| " color: var(--sklearn-color-unfitted-level-1);\n", | |
| " border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 a.estimator_doc_link.fitted {\n", | |
| " /* fitted */\n", | |
| " border: var(--sklearn-color-fitted-level-1) 1pt solid;\n", | |
| " color: var(--sklearn-color-fitted-level-1);\n", | |
| "}\n", | |
| "\n", | |
| "/* On hover */\n", | |
| "#sk-container-id-1 a.estimator_doc_link:hover {\n", | |
| " /* unfitted */\n", | |
| " background-color: var(--sklearn-color-unfitted-level-3);\n", | |
| " color: var(--sklearn-color-background);\n", | |
| " text-decoration: none;\n", | |
| "}\n", | |
| "\n", | |
| "#sk-container-id-1 a.estimator_doc_link.fitted:hover {\n", | |
| " /* fitted */\n", | |
| " background-color: var(--sklearn-color-fitted-level-3);\n", | |
| "}\n", | |
| "</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>RandomForestClassifier(random_state=42)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow\"><div><div>RandomForestClassifier</div></div><div><a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.6/modules/generated/sklearn.ensemble.RandomForestClassifier.html\">?<span>Documentation for RandomForestClassifier</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></div></label><div class=\"sk-toggleable__content fitted\"><pre>RandomForestClassifier(random_state=42)</pre></div> </div></div></div></div>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "execution_count": 7 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "METRICS AS NEEDED" | |
| ], | |
| "metadata": { | |
| "id": "42btvqD5QV89" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "from sklearn.metrics import classification_report\n", | |
| "\n", | |
| "y_pred = clf.predict(X_test)\n", | |
| "report = classification_report(y_test, y_pred, target_names=[\"Non-Hate\",\"Hate\"])\n", | |
| "print(report)" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "W_mLP1McKbhf", | |
| "outputId": "79adaf57-578c-4b8a-fa63-b4010f1b1fc2" | |
| }, | |
| "execution_count": 8, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| " precision recall f1-score support\n", | |
| "\n", | |
| " Non-Hate 0.95 0.99 0.97 5838\n", | |
| " Hate 0.48 0.11 0.18 358\n", | |
| "\n", | |
| " accuracy 0.94 6196\n", | |
| " macro avg 0.71 0.55 0.58 6196\n", | |
| "weighted avg 0.92 0.94 0.92 6196\n", | |
| "\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "CHECKING DISTRIBUTION" | |
| ], | |
| "metadata": { | |
| "id": "UmJgItkqQbU_" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Label distribution\n", | |
| "dist = df[\"label\"].value_counts()\n", | |
| "print(\"Label distribution (0=Non-Hate, 1=Hate):\")\n", | |
| "print(dist)" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "AmvaxpZbL4DO", | |
| "outputId": "9286bc0a-af08-4f25-c653-794aabbfa196" | |
| }, | |
| "execution_count": 9, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Label distribution (0=Non-Hate, 1=Hate):\n", | |
| "label\n", | |
| "0 23353\n", | |
| "1 1430\n", | |
| "Name: count, dtype: int64\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "ENTER A TEXT AND PREDICT ITS SCORE" | |
| ], | |
| "metadata": { | |
| "id": "kXAHcoaTQhsN" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "def predict_hate_speech(comment_text, user_age, user_location, date_posted_str):\n", | |
| " # --- Preprocess the text ---\n", | |
| " # Clean and lemmatize\n", | |
| " text_clean = clean_and_lemmatize(comment_text)\n", | |
| " # TF-IDF transform\n", | |
| " X_text = vectorizer.transform([text_clean])\n", | |
| "\n", | |
| " #------------------CHECK--START\n", | |
| " # --- Process metadata ---\n", | |
| " # Convert date to numeric\n", | |
| " date_posted = pd.to_datetime(date_posted_str)\n", | |
| " date_posted_numeric = (date_posted - reference_date).days\n", | |
| "\n", | |
| " # Build DataFrame for preprocessor\n", | |
| " df_meta = pd.DataFrame([{\n", | |
| " 'user_age': user_age,\n", | |
| " 'user_location': user_location,\n", | |
| " 'date_posted_numeric': date_posted_numeric\n", | |
| " }])\n", | |
| "\n", | |
| "\n", | |
| " # Transform metadata\n", | |
| " X_meta = preprocessor.transform(df_meta)\n", | |
| " #------------------CHECK--END\n", | |
| "\n", | |
| " # --- Combine features ---\n", | |
| " import scipy\n", | |
| " # X_combined = scipy.sparse.hstack([X_text]) #------------------CHECK--COMMENT\n", | |
| " X_combined = scipy.sparse.hstack([X_text, X_meta])\n", | |
| "\n", | |
| " # --- Predict ---\n", | |
| " prediction = clf.predict(X_combined)[0]\n", | |
| " proba = clf.predict_proba(X_combined)[0]\n", | |
| "\n", | |
| " # --- Output ---\n", | |
| " label = \"Hate Speech\" if prediction == 1 else \"Non-Hate\"\n", | |
| " print(f\"Prediction: {label}\")\n", | |
| " print(f\"Confidence: {proba[prediction]:.2f}\")\n", | |
| "\n", | |
| "# Example usage:\n", | |
| "hello=predict_hate_speech(\n", | |
| " comment_text=\" @CB_Baby24: @white_thunduh alsarabsss hes a beaner smh you can tell hes a mexican,\",\n", | |
| " user_age=25,\n", | |
| " user_location=\"London, UK\",\n", | |
| " date_posted_str=\"2020-05-15\"\n", | |
| ")" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "iH2WFxgaMDYy", | |
| "outputId": "a51e545f-5f4b-48c0-9c84-437d649c7e03" | |
| }, | |
| "execution_count": 10, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Prediction: Non-Hate\n", | |
| "Confidence: 0.79\n" | |
| ] | |
| } | |
| ] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment