🎉 (Spider): first commit and EDA

arkhn · Dec 6, 2023 · 4933295 · 4933295
1 parent b64873f
commit 4933295
Show file tree

Hide file tree

Showing 4 changed files with 2,394 additions and 0 deletions.
diff --git a/lib/spider/README.md b/lib/spider/README.md
diff --git a/lib/spider/notebooks/eda.ipynb b/lib/spider/notebooks/eda.ipynb
@@ -0,0 +1,273 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "4480527f3256e044",
+   "metadata": {},
+   "source": [
+    "Dataset archive downloaded from https://drive.google.com/uc?export=download&id=1TqleXec_OykOYFREKKtschzY29dUcVAQ"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2408797a0a334bd7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!unzip ../data/spider.zip"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ee86f2a07265fa2a",
+   "metadata": {},
+   "source": [
+    "# Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e39ff161-ce51-4341-8cae-f8b8f4be2828",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "import pandas as pd\n",
+    "import plotly.express as px"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "92764af2-bf21-4336-a516-1439e8098d06",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset_path = Path(\"../data/spider\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c03202bb-8953-4cd8-82ef-ca5965ed303c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_spider = pd.read_json(dataset_path / \"train_spider.json\")\n",
+    "train_others = pd.read_json(dataset_path / \"train_others.json\")\n",
+    "dev = pd.read_json(dataset_path / \"dev.json\")\n",
+    "train_spider.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ccf9ef9acfa640f4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"train_spider: \", len(train_spider))\n",
+    "print(\"train_others: \", len(train_others))\n",
+    "print(\"dev: \", len(dev))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d4da79a493c04a28",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for df, name in zip([train_spider, train_others, dev], [\"train_spider\", \"train_others\", \"dev\"]):\n",
+    "    fig = px.histogram([len(x) for x in df.question_toks])\n",
+    "    fig.update_layout(\n",
+    "        showlegend=False,\n",
+    "        xaxis_title=\"Number of tokens\",\n",
+    "        title={\n",
+    "            'text': f\"Distribution of the number of tokens in {name} questions\",\n",
+    "            'y':0.95,\n",
+    "            'x':0.5,\n",
+    "            'xanchor': 'center',\n",
+    "            'yanchor': 'top'\n",
+    "        }\n",
+    "    )\n",
+    "    fig.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cad7e8f256c3d994",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for df, name in zip([train_spider, train_others, dev], [\"train_spider\", \"train_others\", \"dev\"]):\n",
+    "    fig = px.histogram([len(x) for x in df.query_toks])\n",
+    "    fig.update_layout(\n",
+    "        showlegend=False,\n",
+    "        xaxis_title=\"Number of tokens\",\n",
+    "        title={\n",
+    "            'text': f\"Distribution of the number of tokens in {name} queries\",\n",
+    "            'y':0.95,\n",
+    "            'x':0.5,\n",
+    "            'xanchor': 'center',\n",
+    "            'yanchor': 'top'\n",
+    "        }\n",
+    "    )\n",
+    "    fig.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4059d45661d6945c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for df, name in zip([train_spider, train_others, dev], [\"train_spider\", \"train_others\", \"dev\"]):\n",
+    "    fig = px.histogram(df, x=\"db_id\").update_xaxes(categoryorder=\"category ascending\")\n",
+    "    fig.update_layout(\n",
+    "        showlegend=False,\n",
+    "        title={\n",
+    "            'text': f\"Databases used in {name}\",\n",
+    "            'y':0.95,\n",
+    "            'x':0.5,\n",
+    "            'xanchor': 'center',\n",
+    "            'yanchor': 'top'\n",
+    "        }\n",
+    "    )\n",
+    "    fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f8809e118ffb551a",
+   "metadata": {},
+   "source": [
+    "# Databases "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4b19828abc0d7607",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sqlite3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "319829d9d8b70877",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "databases_path = list((dataset_path / \"database\").glob(\"*/*.sqlite\"))\n",
+    "print(\"Databases:\", len(databases_path))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ab57cd2ef258da5d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n_tables = {}\n",
+    "for db_path in databases_path:\n",
+    "    conn = sqlite3.connect(str(db_path))\n",
+    "    cursor = conn.cursor()\n",
+    "    cursor.execute(\"SELECT count(*) FROM sqlite_master WHERE type='table';\")\n",
+    "    n_tables[db_path.name] = cursor.fetchone()[0]\n",
+    "\n",
+    "fig = px.histogram(n_tables.values())\n",
+    "fig.update_layout(\n",
+    "    showlegend=False,\n",
+    "    xaxis_title=\"Number of tables\",\n",
+    "    title={\n",
+    "        'text': f\"Distribution of the number of tables in the databases\",\n",
+    "        'y':0.95,\n",
+    "        'x':0.5,\n",
+    "        'xanchor': 'center',\n",
+    "        'yanchor': 'top'\n",
+    "    }\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1c4f8f376952b51d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(f\"Average number of tables: {sum(n_tables.values()) / len(n_tables.values())}\")\n",
+    "print(f\"Max number of tables: {max(n_tables, key=n_tables.get)} with {n_tables[max(n_tables, key=n_tables.get)]} tables\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d5e0ca0825a65980",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n_columns = []\n",
+    "for db_path in databases_path:\n",
+    "    conn = sqlite3.connect(str(db_path))\n",
+    "    cursor = conn.cursor()\n",
+    "    cursor.execute(\"SELECT name FROM sqlite_master WHERE type='table';\")\n",
+    "    tables = cursor.fetchall()\n",
+    "    for table in tables:\n",
+    "        cursor.execute(f\"SELECT COUNT(*) FROM pragma_table_info('{table[0]}')\")\n",
+    "        n_columns.append(cursor.fetchone()[0])\n",
+    "\n",
+    "fig = px.histogram(n_columns)\n",
+    "fig.update_layout(\n",
+    "    showlegend=False,\n",
+    "    xaxis_title=\"Number of columns\",\n",
+    "    title={\n",
+    "        'text': f\"Distribution of the number of columns in the databases' tables\",\n",
+    "        'y':0.95,\n",
+    "        'x':0.5,\n",
+    "        'xanchor': 'center',\n",
+    "        'yanchor': 'top'\n",
+    "    }\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cb8ba5e9a53f92d4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "spider-py3.11",
+   "language": "python",
+   "name": "spider-py3.11"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}