Skip to content

Commit

Permalink
🎉 (Spider): first commit and EDA
Browse files Browse the repository at this point in the history
  • Loading branch information
louni-g committed Dec 6, 2023
1 parent b64873f commit 4933295
Show file tree
Hide file tree
Showing 4 changed files with 2,394 additions and 0 deletions.
Empty file added lib/spider/README.md
Empty file.
273 changes: 273 additions & 0 deletions lib/spider/notebooks/eda.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,273 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "4480527f3256e044",
"metadata": {},
"source": [
"Dataset archive downloaded from https://drive.google.com/uc?export=download&id=1TqleXec_OykOYFREKKtschzY29dUcVAQ"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2408797a0a334bd7",
"metadata": {},
"outputs": [],
"source": [
"!unzip ../data/spider.zip"
]
},
{
"cell_type": "markdown",
"id": "ee86f2a07265fa2a",
"metadata": {},
"source": [
"# Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e39ff161-ce51-4341-8cae-f8b8f4be2828",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"import pandas as pd\n",
"import plotly.express as px"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "92764af2-bf21-4336-a516-1439e8098d06",
"metadata": {},
"outputs": [],
"source": [
"dataset_path = Path(\"../data/spider\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c03202bb-8953-4cd8-82ef-ca5965ed303c",
"metadata": {},
"outputs": [],
"source": [
"train_spider = pd.read_json(dataset_path / \"train_spider.json\")\n",
"train_others = pd.read_json(dataset_path / \"train_others.json\")\n",
"dev = pd.read_json(dataset_path / \"dev.json\")\n",
"train_spider.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ccf9ef9acfa640f4",
"metadata": {},
"outputs": [],
"source": [
"print(\"train_spider: \", len(train_spider))\n",
"print(\"train_others: \", len(train_others))\n",
"print(\"dev: \", len(dev))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4da79a493c04a28",
"metadata": {},
"outputs": [],
"source": [
"for df, name in zip([train_spider, train_others, dev], [\"train_spider\", \"train_others\", \"dev\"]):\n",
" fig = px.histogram([len(x) for x in df.question_toks])\n",
" fig.update_layout(\n",
" showlegend=False,\n",
" xaxis_title=\"Number of tokens\",\n",
" title={\n",
" 'text': f\"Distribution of the number of tokens in {name} questions\",\n",
" 'y':0.95,\n",
" 'x':0.5,\n",
" 'xanchor': 'center',\n",
" 'yanchor': 'top'\n",
" }\n",
" )\n",
" fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cad7e8f256c3d994",
"metadata": {},
"outputs": [],
"source": [
"for df, name in zip([train_spider, train_others, dev], [\"train_spider\", \"train_others\", \"dev\"]):\n",
" fig = px.histogram([len(x) for x in df.query_toks])\n",
" fig.update_layout(\n",
" showlegend=False,\n",
" xaxis_title=\"Number of tokens\",\n",
" title={\n",
" 'text': f\"Distribution of the number of tokens in {name} queries\",\n",
" 'y':0.95,\n",
" 'x':0.5,\n",
" 'xanchor': 'center',\n",
" 'yanchor': 'top'\n",
" }\n",
" )\n",
" fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4059d45661d6945c",
"metadata": {},
"outputs": [],
"source": [
"for df, name in zip([train_spider, train_others, dev], [\"train_spider\", \"train_others\", \"dev\"]):\n",
" fig = px.histogram(df, x=\"db_id\").update_xaxes(categoryorder=\"category ascending\")\n",
" fig.update_layout(\n",
" showlegend=False,\n",
" title={\n",
" 'text': f\"Databases used in {name}\",\n",
" 'y':0.95,\n",
" 'x':0.5,\n",
" 'xanchor': 'center',\n",
" 'yanchor': 'top'\n",
" }\n",
" )\n",
" fig.show()"
]
},
{
"cell_type": "markdown",
"id": "f8809e118ffb551a",
"metadata": {},
"source": [
"# Databases "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4b19828abc0d7607",
"metadata": {},
"outputs": [],
"source": [
"import sqlite3"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "319829d9d8b70877",
"metadata": {},
"outputs": [],
"source": [
"databases_path = list((dataset_path / \"database\").glob(\"*/*.sqlite\"))\n",
"print(\"Databases:\", len(databases_path))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ab57cd2ef258da5d",
"metadata": {},
"outputs": [],
"source": [
"n_tables = {}\n",
"for db_path in databases_path:\n",
" conn = sqlite3.connect(str(db_path))\n",
" cursor = conn.cursor()\n",
" cursor.execute(\"SELECT count(*) FROM sqlite_master WHERE type='table';\")\n",
" n_tables[db_path.name] = cursor.fetchone()[0]\n",
"\n",
"fig = px.histogram(n_tables.values())\n",
"fig.update_layout(\n",
" showlegend=False,\n",
" xaxis_title=\"Number of tables\",\n",
" title={\n",
" 'text': f\"Distribution of the number of tables in the databases\",\n",
" 'y':0.95,\n",
" 'x':0.5,\n",
" 'xanchor': 'center',\n",
" 'yanchor': 'top'\n",
" }\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c4f8f376952b51d",
"metadata": {},
"outputs": [],
"source": [
"print(f\"Average number of tables: {sum(n_tables.values()) / len(n_tables.values())}\")\n",
"print(f\"Max number of tables: {max(n_tables, key=n_tables.get)} with {n_tables[max(n_tables, key=n_tables.get)]} tables\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d5e0ca0825a65980",
"metadata": {},
"outputs": [],
"source": [
"n_columns = []\n",
"for db_path in databases_path:\n",
" conn = sqlite3.connect(str(db_path))\n",
" cursor = conn.cursor()\n",
" cursor.execute(\"SELECT name FROM sqlite_master WHERE type='table';\")\n",
" tables = cursor.fetchall()\n",
" for table in tables:\n",
" cursor.execute(f\"SELECT COUNT(*) FROM pragma_table_info('{table[0]}')\")\n",
" n_columns.append(cursor.fetchone()[0])\n",
"\n",
"fig = px.histogram(n_columns)\n",
"fig.update_layout(\n",
" showlegend=False,\n",
" xaxis_title=\"Number of columns\",\n",
" title={\n",
" 'text': f\"Distribution of the number of columns in the databases' tables\",\n",
" 'y':0.95,\n",
" 'x':0.5,\n",
" 'xanchor': 'center',\n",
" 'yanchor': 'top'\n",
" }\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cb8ba5e9a53f92d4",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "spider-py3.11",
"language": "python",
"name": "spider-py3.11"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading

0 comments on commit 4933295

Please sign in to comment.