Add gpt4all support #2

Open

wants to merge 23 commits into main
1 change: 0 additions & 1 deletion README.md

This file was deleted.

11 changes: 11 additions & 0 deletions analyse.py
@@ -0,0 +1,11 @@
import pandas as pd

dirty_test_df = pd.read_csv("/home/preethi/projects/pandas-ai-integration/data/Airbnb/missing_values/dirty_test.csv")
print("Dirty test len: ", len(dirty_test_df))

dirty_train_df = pd.read_csv("/home/preethi/projects/pandas-ai-integration/data/Airbnb/missing_values/dirty_train.csv")
print("Dirty train len: ", len(dirty_train_df))

cleaned_df = pd.read_csv("/home/preethi/projects/pandas-ai-integration/cleaned_df.csv")
print("Cleaned df len: ", len(cleaned_df))
# Get the number of columns
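
The trailing comment above is left unfinished; if the intent is simply to report the cleaned frame's width, a one-line completion (an assumption, not part of this commit) would be:

print("Number of columns: ", len(cleaned_df.columns))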
167 changes: 167 additions & 0 deletions benchmark.py
@@ -0,0 +1,167 @@
import os
import pandas as pd
import evadb
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

cursor = evadb.connect().cursor()
print("Connected to EvaDB")
#local
# create_function_query = f"""CREATE FUNCTION IF NOT EXISTS ChatWithPandas
# IMPL './functions/chat_with_df.py' use_local_llm 'True' local_llm_model "llama-2-7b-chat.ggmlv3.q4_0.bin" csv_path "./data/cars.csv";
# """

create_function_query = f"""CREATE FUNCTION IF NOT EXISTS ChatWithPandas
IMPL './functions/chat_with_df.py';
"""

cursor.query("DROP FUNCTION IF EXISTS ChatWithPandas;").execute()

cursor.query(create_function_query).execute()
print("Created Function")

create_table_query = f"""CREATE TABLE IF NOT EXISTS AIRBNB_DATA5(
bathrooms FLOAT(64, 64),
bedrooms FLOAT(64, 64),
beds FLOAT(64, 64),
location_name TEXT(255),
num_guests FLOAT(64, 64),
num_reviews FLOAT(64, 64),
price FLOAT(64, 64),
rating TEXT(255),
latitude FLOAT(64, 64),
longitude FLOAT(64, 64),
zipcode TEXT(10),
pop2016 FLOAT(64, 64),
pop2010 FLOAT(64, 64),
pop2000 FLOAT(64, 64),
cost_living_index FLOAT(64, 64),
land_area FLOAT(64, 64),
water_area FLOAT(64, 64),
pop_density INTEGER,
number_of_males INTEGER,
number_of_females INTEGER,
prop_taxes_paid_2016 FLOAT(64, 64),
median_taxes_with_mortgage FLOAT(64, 64),
median_taxes_no_mortgage FLOAT(64, 64),
median_house_value FLOAT(64, 64),
median_household_income FLOAT(64, 64),
median_monthly_owner_costs_with_mortgage FLOAT(64, 64),
median_monthly_owner_costs_no_mortgage FLOAT(64, 64),
median_gross_rent FLOAT(64, 64),
median_asking_price_for_sale_home_condo FLOAT(64, 64),
unemployment FLOAT(64, 64),
number_of_homes INTEGER,
count_of_abnb INTEGER,
density_of_abnb FLOAT(64, 64),
avg_abnb_price_by_zipcode FLOAT(64, 64),
avg_num_reviews_by_zipcode FLOAT(64, 64),
avg_rating_by_zipcode FLOAT(64, 64),
avg_num_bathrooms_by_zipcode FLOAT(64, 64),
avg_num_bedrooms_by_zipcode FLOAT(64, 64),
avg_num_beds_by_zipcode FLOAT(64, 64),
avg_num_guests_by_zipcode FLOAT(64, 64)
); """

load_data_query = f""" LOAD CSV 'data/Airbnb/missing_values/dirty_test1.csv' INTO AIRBNB_DATA5;"""
cursor.query(create_table_query).df()
cursor.query(load_data_query).df()
print("loaded data")


# data = pd.read_csv('data/Airbnb/missing_values/dirty_test1.csv')
data = pd.read_csv('cleaned_dfs/cleaned_df_int.csv')
# data = pd.read_csv('cleaned_df.csv')

#clean using llm
# remove duplicate rows.', \

# query = f""" SELECT ChatWithPandas('cleaning',\
# 'impute null values with average of the column if an integer or float. replace with an empty string if column is a string.',\
# Bathrooms, Bedrooms, Beds, Location_Name, Num_Guests, Num_Reviews, Price, Rating, latitude, longitude, zipcode, pop2016, pop2010, pop2000, cost_living_index, land_area, water_area, pop_density, number_of_males, number_of_females, prop_taxes_paid_2016, median_taxes_with_mortgage, median_taxes_no_mortgage, median_house_value, median_household_income, median_monthly_owner_costs_with_mortgage, median_monthly_owner_costs_no_mortgage, median_gross_rent, median_asking_price_for_sale_home_condo, unemployment, number_of_homes, count_of_abnb, density_of_abnb, avg_abnb_price_by_zipcode, avg_num_reviews_by_zipcode, avg_rating_by_zipcode, avg_num_bathrooms_by_zipcode, avg_num_bedrooms_by_zipcode, avg_num_beds_by_zipcode, avg_num_guests_by_zipcode) FROM AIRBNB_DATA5;
# """
# data = cursor.query(query).execute()
#clean ends here


# data = data.dropna()
# Identify categorical columns
categorical_cols = data.select_dtypes(include=['object']).columns

data = pd.get_dummies(data, columns=categorical_cols)
# data.dropna()

# Split features and labels
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = X_train.astype(float)
X_test = X_test.astype(float)
y_train = y_train.astype(float)
y_test = y_test.astype(float)

# Convert to torch tensors
X_train_tensor = torch.FloatTensor(X_train)
X_test_tensor = torch.FloatTensor(X_test)
y_train_tensor = torch.FloatTensor(y_train)
y_test_tensor = torch.FloatTensor(y_test)

# Define a simple logistic regression model
class LogisticRegression(nn.Module):
    def __init__(self, input_dim):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        return torch.sigmoid(self.linear(x))

input_dim = X_train.shape[1]
model = LogisticRegression(input_dim)

# Loss and optimizer (BCELoss matches the single sigmoid output; CrossEntropyLoss expects class logits)
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Train the model
epochs = 50
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor).squeeze()
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

# Compute accuracy on the held-out split
model.eval()
with torch.no_grad():
    predictions = model(X_test_tensor).squeeze()
    predictions = (predictions > 0.5).float()
    correct = (predictions == y_test_tensor).float().sum()
    accuracy = correct / len(y_test_tensor)
    print("accuracy:", accuracy.item())



# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# model = LogisticRegression()
# model.fit(X_train, y_train)

# y_pred = model.predict(X_test)

# accuracy = accuracy_score(y_test, y_pred)
# f1 = f1_score(y_test, y_pred)

# print(f"Accuracy: {accuracy:.2f}")
# print(f"F1 Score: {f1:.2f}")
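
If the scikit-learn baseline sketched in the commented-out lines above is preferred over the torch loop, a runnable version would look like the following (it reuses the X_train/X_test split from earlier in this script; the alias and max_iter value are added assumptions, since sklearn's LogisticRegression name is shadowed by the torch class defined above):

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Alias avoids the torch LogisticRegression class defined earlier in this file.
from sklearn.linear_model import LogisticRegression as SkLogisticRegression

sk_model = SkLogisticRegression(max_iter=1000)
sk_model.fit(X_train_scaled, y_train)
y_pred = sk_model.predict(X_test_scaled)

print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.2f}")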
51 changes: 51 additions & 0 deletions chat_runner_local.py
@@ -0,0 +1,51 @@
import os
import pandas as pd
import evadb

cursor = evadb.connect().cursor()
print("Connected to EvaDB")

# create_function_query = f"""CREATE FUNCTION IF NOT EXISTS ChatWithPandas
# IMPL './functions/semantic_cache.py';
# """
create_function_query = f"""CREATE FUNCTION IF NOT EXISTS ChatWithPandas
IMPL './functions/semantic_cache.py' use_local_llm 'True' local_llm_model "llama-2-7b-chat.ggmlv3.q4_0.bin" csv_path "./data/cars.csv";
"""
cursor.query("DROP FUNCTION IF EXISTS ChatWithPandas;").execute()
cursor.query(create_function_query).execute()
print("Created Function")

create_table_query = f"""
CREATE TABLE IF NOT EXISTS CARSDATA(
id INTEGER,
name TEXT(30),
mpg INTEGER,
cyl FLOAT(64,64),
disp FLOAT(64,64),
hp FLOAT(64,64),
drat FLOAT(64,64),
wt FLOAT(64,64),
qsec FLOAT(64,64),
vs FLOAT(64,64),
am FLOAT(64,64),
gear FLOAT(64,64),
carb FLOAT(64,64)
);
"""
load_data_query = f""" LOAD CSV 'data/cars.csv' INTO CARSDATA;
"""

cursor.query(create_table_query).execute()
cursor.query(load_data_query).execute()
print("loaded data")

chat_query1 = f""" SELECT ChatWithPandas('what is the mean of the gear column',gear, name) FROM CARSDATA;
"""

result1 = cursor.query(chat_query1).execute()
print(result1)

chat_query2 = f""" SELECT ChatWithPandas('which car has the highest gear value',gear, name) FROM CARSDATA;
"""
result2 = cursor.query(chat_query2).execute()
print(result2)
3 changes: 3 additions & 0 deletions config.py
@@ -1,7 +1,10 @@
class Config:
    def __init__(self) -> None:
        self.open_ai_key = ""
        self.local_llm_model = "llama-2-7b-chat.ggmlv3.q4_0.bin"

    def get_open_ai_key(self):
        return self.open_ai_key

    def get_local_llm_model(self):
        return self.local_llm_model
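
As a quick illustration of how this default feeds the local model loader (a minimal sketch, assuming the gpt4all package is installed; it mirrors initialize_local_llm_model in datastructure/aidDataframe.py rather than reproducing it exactly):

from gpt4all import GPT4All
from config import Config

config = Config()
# No model name passed explicitly, so fall back to the configured default.
model_name = config.get_local_llm_model()   # "llama-2-7b-chat.ggmlv3.q4_0.bin"
llm = GPT4All(model_name)                   # loads (or downloads) the model locally
print(llm.generate("Reply with the single word: ready"))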

2 changes: 2 additions & 0 deletions data/Airbnb/missing_values/dummy.csv
@@ -0,0 +1,2 @@
Bathrooms,Bedrooms,Beds,LocationName,NumGuests,NumReviews,Price,Rating,latitude,longitude,zipcode,pop2016,pop2010,pop2000,cost_living_index (US avg. = 100),land_area (sq.mi.),water_area (sq.mi.),pop_density (people per mile),number of males,number of females,prop taxes paid 2016,median taxes (with mortgage,median taxes (no mortgage),median house value,median houshold income,median monthly owner costs (with mortgage),median monthly owner costs (no mortgage),median gross rent,median asking price for vacant for-sale home/condo,unemployment (%),Number of Homes,Count of Abnb,Density of Abnb (%),Average Abnb Price (by zipcode),Average NumReviews (by zipcode),Average Rating (by zipcode),Average Number of Bathrooms (by zipcode),Average Number of Bedrooms (by zipcode),Average Number of Beds (by zipcode),Average Number of Guests (by zipcode)
3.0,4.0,5.0,Atlanta,10.0,19.0,795.0,Y,33.76088,-84.36917,30308,17280.0,15413.0,11796,98.0,1.6,0.0,10836,10075,7205,1.2,3155.0,2380.0,259718.0,59088.0,1713.0,665.0,1162.0,326958.0,4.6,6912.0,210,3.038194444,141.4285714,36.27329193,4.880794702,1.285714286,1.494680851,1.933333333,1.933333333
46 changes: 36 additions & 10 deletions datastructure/aidDataframe.py
@@ -1,5 +1,12 @@
import pandas as pd
import openai
import subprocess
from gpt4all import GPT4All
from langchain.llms import OpenAI
from langchain.agents import create_pandas_dataframe_agent
from langchain.chat_models import ChatOpenAI
from langchain.agents.agent_types import AgentType
# from prompts.error_correction_prompt import ErrorCorrectionPrompt
from config import Config
import re
import os
@@ -11,6 +18,7 @@ def __init__(self, df, config=None, description=None, name=None) -> None:

#initialize pandas dataframe
self.pd_df = df
print("pd_df at init:\n", str(self.pd_df))
self.config = Config()

if len(df)>0:
@@ -26,7 +34,8 @@ def __init__(self, df, config=None, description=None, name=None) -> None:
self.config = config

#set name
self.name = name
if name:
self.name = name

@property
def col_count(self):
@@ -60,6 +69,12 @@ def initialize_middleware(self):
self.openai_model = "gpt-3.5-turbo"
return

def initialize_local_llm_model(self, local_llm=None):
if local_llm:
local_llm_model = local_llm
else:
local_llm_model = self.config.get_local_llm_model()
return GPT4All(local_llm_model)

def create_query_prompt(self, query: str):
prompt = f"I need you to write a python3.8 program for the following dataframe. \
@@ -142,7 +157,7 @@ def execute_python(self, python_code: str, type: str):
file.write(python_code)

from tmp import pandas_query_function
answer = pandas_query_function(self.pd_df)
answer = query_dataframe(self.pd_df)

#delete file
os.remove("tmp.py")
@@ -184,7 +199,7 @@ def execute_python(self, python_code: str, type: str):



def query_dataframe(self, query: str):
def query_dataframe(self, query: str, use_local_llm=None, local_llm_model=None, csv_path=None):
"""A function used by user to query and get some values from the dataframe.

Args:
@@ -194,12 +209,22 @@ def query_dataframe(self, query: str, use_local_llm=None, local_llm_model=None, csv_path=None):
A string format with the required answer
"""
prompt = self.create_query_prompt(query)

completion = openai.ChatCompletion.create(model="gpt-3.5-turbo", \
temperature=0.2, \
messages=[{"role": "user", "content": prompt}])

python_code = completion.choices[0].message.content
if use_local_llm:
print("Using local LLM")
local_llm = self.initialize_local_llm_model(local_llm=local_llm_model)
print("Prompt:", prompt)
response = local_llm.generate(prompt)
print("Response:", response)
if "```" in response:
python_code = response.split("```")[1].lstrip("python")
else:
python_code = response
else:
completion = openai.ChatCompletion.create(model="gpt-3.5-turbo", \
temperature=0.2, \
messages=[{"role": "user", "content": prompt}])

python_code = completion.choices[0].message.content
answer = self.execute_python(python_code, "query")

return f"Question is {query} and Answer is {answer}"
@@ -250,7 +275,6 @@ def manipulate_dataframe(self, manipulation_query):

def clean_dataframe(self, clean_instructions):
prompt = self.create_data_cleaning_prompt(clean_instructions)

completion = openai.ChatCompletion.create(model="gpt-3.5-turbo", \
temperature=0.2, \
messages=[{"role": "user", "content": prompt}])
@@ -260,6 +284,8 @@ def clean_dataframe(self, clean_instructions):
return answer







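To summarize the new local-LLM path for reviewers, here is a minimal standalone sketch of the generate-then-execute flow (the helper name run_local_query and the exact prompt wording are illustrative assumptions; the real code above builds the prompt in create_query_prompt and writes the generated program to tmp.py instead of exec-ing it in memory):

import pandas as pd
from gpt4all import GPT4All

def run_local_query(df: pd.DataFrame, question: str, model_name: str):
    llm = GPT4All(model_name)
    prompt = (
        "Write a python3.8 function pandas_query_function(df) that answers: "
        + question + ". Return only code inside a ``` fenced block."
    )
    response = llm.generate(prompt)
    # Pull the fenced block out of the reply; otherwise treat the whole reply as code.
    code = response.split("```")[1] if "```" in response else response
    if code.startswith("python"):
        code = code[len("python"):]
    namespace = {}
    exec(code, namespace)  # assumes the model returned valid Python, as the prompt requests
    return namespace["pandas_query_function"](df)
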
11 changes: 8 additions & 3 deletions functions/chat_with_df.py
@@ -1,5 +1,6 @@

import pandas as pd
import subprocess
import os

from evadb.catalog.catalog_type import NdArrayType
@@ -13,7 +14,10 @@
class ChatWithPandas(AbstractFunction):

@setup(cacheable=False, function_type="FeatureExtraction", batchable=False)
def setup(self):
def setup(self, use_local_llm=False, local_llm_model=None, csv_path=None):
self.use_local_llm = use_local_llm
self.local_llm_model = local_llm_model
self.csv_path = csv_path
pass

@property
@@ -57,7 +61,8 @@ def forward(self, df: pd.DataFrame) -> pd.DataFrame:
response = "cleaned dataframe is saved to cleaned_df.csv"

if type == "query":
response = smart_df.query_dataframe(query)
print("use_local_llm:", self.use_local_llm)
response = smart_df.query_dataframe(query, self.use_local_llm, self.local_llm_model, self.csv_path)
elif type == "plot":
response = smart_df.plot_dataframe(query)
elif type == "manipulation":
@@ -68,4 +73,4 @@ def forward(self, df: pd.DataFrame) -> pd.DataFrame:

ans_df = pd.DataFrame(df_dict)
return pd.DataFrame(ans_df)
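
For reference, the new setup keyword arguments arrive from the CREATE FUNCTION statement, as in chat_runner_local.py; a minimal registration sketch (the values shown are the ones used there, not requirements):

create_function_query = """CREATE FUNCTION IF NOT EXISTS ChatWithPandas
    IMPL './functions/chat_with_df.py'
    use_local_llm 'True'
    local_llm_model "llama-2-7b-chat.ggmlv3.q4_0.bin"
    csv_path "./data/cars.csv";
"""
# Each key/value pair after IMPL is forwarded to ChatWithPandas.setup() as a keyword argument.
cursor.query(create_function_query).execute()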

2 changes: 2 additions & 0 deletions run_test.sh
@@ -0,0 +1,2 @@
export PYTHONPATH=$PWD
python3 -m unittest discover test/