diff --git a/README.md b/README.md deleted file mode 100644 index 4e55451..0000000 --- a/README.md +++ /dev/null @@ -1 +0,0 @@ -# pandas-ai-integration \ No newline at end of file diff --git a/analyse.py b/analyse.py new file mode 100644 index 0000000..9d12931 --- /dev/null +++ b/analyse.py @@ -0,0 +1,11 @@ +import pandas as pd + +dirty_test_df = pd.read_csv("/home/preethi/projects/pandas-ai-integration/data/Airbnb/missing_values/dirty_test.csv") +print("Dirty test len: ", len(dirty_test_df)) + +delete_test_df = pd.read_csv("/home/preethi/projects/pandas-ai-integration/data/Airbnb/missing_values/dirty_train.csv") +print("Dirty train len: ", len(delete_test_df)) + +cleaned_df = pd.read_csv("/home/preethi/projects/pandas-ai-integration/cleaned_df.csv") +print("Cleaned df: ", len(cleaned_df)) +# Get the number of columns diff --git a/benchmark.py b/benchmark.py new file mode 100644 index 0000000..94decf5 --- /dev/null +++ b/benchmark.py @@ -0,0 +1,167 @@ +import os +import pandas as pd +import evadb +import pandas as pd +import torch +import torch.nn as nn +import torch.optim as optim +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import accuracy_score, f1_score + +cursor = evadb.connect().cursor() +print("Connected to EvaDB") +#local +# create_function_query = f"""CREATE FUNCTION IF NOT EXISTS ChatWithPandas +# IMPL './functions/chat_with_df.py' use_local_llm 'True' local_llm_model "llama-2-7b-chat.ggmlv3.q4_0.bin" csv_path "./data/cars.csv"; +# """ + +create_function_query = f"""CREATE FUNCTION IF NOT EXISTS ChatWithPandas + IMPL './functions/chat_with_df.py'; + """ + +cursor.query("DROP FUNCTION IF EXISTS ChatWithPandas;").execute() + +cursor.query(create_function_query).execute() +print("Created Function") + +create_table_query = f"""CREATE TABLE IF NOT EXISTS AIRBNB_DATA5( + bathrooms FLOAT(64, 64), + bedrooms FLOAT(64, 64), + beds FLOAT(64, 64), + location_name TEXT(255), + num_guests FLOAT(64, 64), + num_reviews FLOAT(64, 64), + price FLOAT(64, 64), + rating TEXT(225), + latitude FLOAT(64, 64), + longitude FLOAT(64, 64), + zipcode TEXT(10), + pop2016 FLOAT(64, 64), + pop2010 FLOAT(64, 64), + pop2000 FLOAT(64, 64), + cost_living_index FLOAT(64, 64), + land_area FLOAT(64, 64), + water_area FLOAT(64, 64), + pop_density INTEGER, + number_of_males INTEGER, + number_of_females INTEGER, + prop_taxes_paid_2016 FLOAT(64, 64), + median_taxes_with_mortgage FLOAT(64, 64), + median_taxes_no_mortgage FLOAT(64, 64), + median_house_value FLOAT(64, 64), + median_household_income FLOAT(64, 64), + median_monthly_owner_costs_with_mortgage FLOAT(64, 64), + median_monthly_owner_costs_no_mortgage FLOAT(64, 64), + median_gross_rent FLOAT(64, 64), + median_asking_price_for_sale_home_condo FLOAT(64, 64), + unemployment FLOAT(64, 64), + number_of_homes INTEGER, + count_of_abnb INTEGER, + density_of_abnb FLOAT(64, 64), + avg_abnb_price_by_zipcode FLOAT(64, 64), + avg_num_reviews_by_zipcode FLOAT(64, 64), + avg_rating_by_zipcode FLOAT(64, 64), + avg_num_bathrooms_by_zipcode FLOAT(64, 64), + avg_num_bedrooms_by_zipcode FLOAT(64, 64), + avg_num_beds_by_zipcode FLOAT(64, 64), + avg_num_guests_by_zipcode FLOAT(64, 64) +); """ + +load_data_query = f""" LOAD CSV 'data/Airbnb/missing_values/dirty_test1.csv' INTO AIRBNB_DATA5;""" +cursor.query(create_table_query).df() +cursor.query(load_data_query).df() +print("loaded data") + + +# data = 
pd.read_csv('data/Airbnb/missing_values/dirty_test1.csv')
+data = pd.read_csv('cleaned_dfs/cleaned_df_int.csv')
+# data = pd.read_csv('cleaned_df.csv')
+
+#clean using llm
+    # remove duplicate rows.', \
+
+# query = f""" SELECT ChatWithPandas('cleaning',\
+# 'impute null values with average of the column if an integer or float. replace with an empty string if column is a string.',\
+# Bathrooms, Bedrooms, Beds, Location_Name, Num_Guests, Num_Reviews, Price, Rating, latitude, longitude, zipcode, pop2016, pop2010, pop2000, cost_living_index, land_area, water_area, pop_density, number_of_males, number_of_females, prop_taxes_paid_2016, median_taxes_with_mortgage, median_taxes_no_mortgage, median_house_value, median_household_income, median_monthly_owner_costs_with_mortgage, median_monthly_owner_costs_no_mortgage, median_gross_rent, median_asking_price_for_sale_home_condo, unemployment, number_of_homes, count_of_abnb, density_of_abnb, avg_abnb_price_by_zipcode, avg_num_reviews_by_zipcode, avg_rating_by_zipcode, avg_num_bathrooms_by_zipcode, avg_num_bedrooms_by_zipcode, avg_num_beds_by_zipcode, avg_num_guests_by_zipcode) FROM AIRBNB_DATA5;
+# """
+# data = cursor.query(query).execute()
+#clean ends here
+
+
+# data = data.dropna()
+# Identify categorical columns
+categorical_cols = data.select_dtypes(include=['object']).columns
+
+data = pd.get_dummies(data, columns=categorical_cols)
+# data.dropna()
+
+# Split features and labels
+X = data.iloc[:, :-1].values
+y = data.iloc[:, -1].values
+
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+X_train = X_train.astype(float)
+X_test = X_test.astype(float)
+y_train = y_train.astype(float)
+y_test = y_test.astype(float)
+
+# Convert to torch tensors
+X_train_tensor = torch.FloatTensor(X_train)
+X_test_tensor = torch.FloatTensor(X_test)
+y_train_tensor = torch.FloatTensor(y_train)
+y_test_tensor = torch.FloatTensor(y_test)
+
+# Define a simple logistic regression model
+class LogisticRegression(nn.Module):
+    def __init__(self, input_dim):
+        super(LogisticRegression, self).__init__()
+        self.linear = nn.Linear(input_dim, 1)
+
+    def forward(self, x):
+        return torch.sigmoid(self.linear(x))
+
+input_dim = X_train.shape[1]
+model = LogisticRegression(input_dim)
+
+# Loss and optimizer
+criterion = nn.BCELoss()  # binary cross-entropy matches the single sigmoid output and 0/1 float targets
+optimizer = optim.SGD(model.parameters(), lr=0.01)
+
+# Train the model
+epochs = 50
+for epoch in range(epochs):
+    model.train()
+    optimizer.zero_grad()
+    outputs = model(X_train_tensor).squeeze()
+    loss = criterion(outputs, y_train_tensor)
+    loss.backward()
+    optimizer.step()
+
+# Compute accuracy
+model.eval()
+with torch.no_grad():
+    predictions = model(X_test_tensor).squeeze()
+    predictions = (predictions > 0.5).float()
+    correct = (predictions == y_test_tensor).float().sum()
+    accuracy = correct / len(y_test_tensor)
+    print("accuracy:", accuracy)
+
+
+
+# scaler = StandardScaler()
+# X_train = scaler.fit_transform(X_train)
+# X_test = scaler.transform(X_test)
+
+# model = LogisticRegression()
+# model.fit(X_train, y_train)
+
+# y_pred = model.predict(X_test)
+
+# accuracy = accuracy_score(y_test, y_pred)
+# f1 = f1_score(y_test, y_pred)
+
+# print(f"Accuracy: {accuracy:.2f}")
+# print(f"F1 Score: {f1:.2f}")
diff --git a/chat_runner_local.py b/chat_runner_local.py
new file mode 100644
index 0000000..dda25bc
--- /dev/null
+++ b/chat_runner_local.py
@@ -0,0 +1,51 @@
+import os
+import pandas as pd
+import evadb
+
+cursor = evadb.connect().cursor()
+print("Connected to EvaDB")
+
+# 
create_function_query = f"""CREATE FUNCTION IF NOT EXISTS ChatWithPandas +# IMPL './functions/semantic_cache.py'; +# """ +create_function_query = f"""CREATE FUNCTION IF NOT EXISTS ChatWithPandas + IMPL './functions/semantic_cache.py' use_local_llm 'True' local_llm_model "llama-2-7b-chat.ggmlv3.q4_0.bin" csv_path "./data/cars.csv"; + """ +cursor.query("DROP FUNCTION IF EXISTS ChatWithPandas;").execute() +cursor.query(create_function_query).execute() +print("Created Function") + +create_table_query = f""" +CREATE TABLE IF NOT EXISTS CARSDATA( +id INTEGER, +name TEXT(30), +mpg INTEGER, +cyl FLOAT(64,64), +disp FLOAT(64,64), +hp FLOAT(64,64), +drat FLOAT(64,64), +wt FLOAT(64,64), +qsec FLOAT(64,64), +vs FLOAT(64,64), +am FLOAT(64,64), +gear FLOAT(64,64), +carb FLOAT(64,64) +); +""" +load_data_query = f""" LOAD CSV 'data/cars.csv' INTO CARSDATA; +""" + +cursor.query(create_table_query).execute() +cursor.query(load_data_query).execute() +print("loaded data") + +chat_query1 = f""" SELECT ChatWithPandas('what is the mean of the gear column',gear, name) FROM CARSDATA; +""" + +result1 = cursor.query(chat_query1).execute() +print(result1) + +chat_query2 = f""" SELECT ChatWithPandas('which car has the highest gear value',gear, name) FROM CARSDATA; +""" +result2 = cursor.query(chat_query2).execute() +print(result2) \ No newline at end of file diff --git a/config.py b/config.py index 8905a18..6f4e6ec 100644 --- a/config.py +++ b/config.py @@ -1,7 +1,10 @@ class Config: def __init__(self) -> None: self.open_ai_key = "" + self.local_llm_model = "llama-2-7b-chat.ggmlv3.q4_0.bin" def get_open_ai_key(self): return self.open_ai_key + def get_local_llm_model(self): + return self.local_llm_model \ No newline at end of file diff --git a/data/Airbnb/missing_values/dummy.csv b/data/Airbnb/missing_values/dummy.csv new file mode 100644 index 0000000..833ec7f --- /dev/null +++ b/data/Airbnb/missing_values/dummy.csv @@ -0,0 +1,2 @@ +Bathrooms,Bedrooms,Beds,LocationName,NumGuests,NumReviews,Price,Rating,latitude,longitude,zipcode,pop2016,pop2010,pop2000,cost_living_index (US avg. 
= 100),land_area (sq.mi.),water_area (sq.mi.),pop_density (people per mile),number of males,number of females,prop taxes paid 2016,median taxes (with mortgage,median taxes (no mortgage),median house value,median houshold income,median monthly owner costs (with mortgage),median monthly owner costs (no mortgage),median gross rent,median asking price for vacant for-sale home/condo,unemployment (%),Number of Homes,Count of Abnb,Density of Abnb (%),Average Abnb Price (by zipcode),Average NumReviews (by zipcode),Average Rating (by zipcode),Average Number of Bathrooms (by zipcode),Average Number of Bedrooms (by zipcode),Average Number of Beds (by zipcode),Average Number of Guests (by zipcode)
+3.0,4.0,5.0,Atlanta,10.0,19.0,795.0,Y,33.76088,-84.36917,30308,17280.0,15413.0,11796,98.0,1.6,0.0,10836,10075,7205,1.2,3155.0,2380.0,259718.0,59088.0,1713.0,665.0,1162.0,326958.0,4.6,6912.0,210,3.038194444,141.4285714,36.27329193,4.880794702,1.285714286,1.494680851,1.933333333,1.933333333
diff --git a/datastructure/aidDataframe.py b/datastructure/aidDataframe.py
index ab940a7..79585df 100644
--- a/datastructure/aidDataframe.py
+++ b/datastructure/aidDataframe.py
@@ -1,5 +1,12 @@
 import pandas as pd
 import openai
+import subprocess
+from gpt4all import GPT4All
+from langchain.llms import OpenAI
+from langchain.agents import create_pandas_dataframe_agent
+from langchain.chat_models import ChatOpenAI
+from langchain.agents.agent_types import AgentType
+# from prompts.error_correction_prompt import ErrorCorrectionPrompt
 from config import Config
 import re
 import os
@@ -11,6 +18,7 @@ def __init__(self, df, config=None, description=None, name=None) -> None:
         #initialize pandas dataframe
         self.pd_df = df
+        print("pd_df INITTT: \n", str(self.pd_df))
         self.config = Config()
 
         if len(df)>0:
@@ -26,7 +34,8 @@ def __init__(self, df, config=None, description=None, name=None) -> None:
             self.config = config
 
         #set name
-        self.name = name
+        if name:
+            self.name = name
 
     @property
     def col_count(self):
@@ -60,6 +69,12 @@ def initialize_middleware(self):
         self.openai_model = "gpt-3.5-turbo"
         return
 
+    def initialize_local_llm_model(self, local_llm=None):
+        if local_llm:
+            local_llm_model = local_llm
+        else:
+            local_llm_model = self.config.get_local_llm_model()
+        return GPT4All(local_llm_model)
 
     def create_query_prompt(self, query: str):
         prompt = f"I need you to write a python3.8 program for the following dataframe. \
@@ -142,7 +157,7 @@ def execute_python(self, python_code: str, type: str):
             file.write(python_code)
 
-            from tmp import pandas_query_function
-            answer = pandas_query_function(self.pd_df)
+            from tmp import pandas_query_function as query_dataframe
+            answer = query_dataframe(self.pd_df)
 
             #delete file
             os.remove("tmp.py")
@@ -184,7 +199,7 @@ def execute_python(self, python_code: str, type: str):
 
 
 
-    def query_dataframe(self, query: str):
+    def query_dataframe(self, query: str, use_local_llm=None, local_llm_model=None, csv_path=None):
         """A function used by user to query and get some values from the dataframe. 
Args: @@ -194,12 +209,22 @@ def query_dataframe(self, query: str): A string format with the required answer """ prompt = self.create_query_prompt(query) - - completion = openai.ChatCompletion.create(model="gpt-3.5-turbo", \ - temperature=0.2, \ - messages=[{"role": "user", "content": prompt}]) - - python_code = completion.choices[0].message.content + if use_local_llm: + print("USING LOCAL LLM") + local_llm = self.initialize_local_llm_model(local_llm=local_llm_model) + print("PROMPTT", prompt) + response = local_llm.generate(prompt) + print("RESPONSEEE", response) + if "```" in response: + python_code = response.split("```")[1].lstrip("python") + else: + python_code = response + else: + completion = openai.ChatCompletion.create(model="gpt-3.5-turbo", \ + temperature=0.2, \ + messages=[{"role": "user", "content": prompt}]) + + python_code = completion.choices[0].message.content answer = self.execute_python(python_code, "query") return f"Question is {query} and Answer is {answer}" @@ -250,7 +275,6 @@ def manipulate_dataframe(self, manipulation_query): def clean_dataframe(self, clean_instructions): prompt = self.create_data_cleaning_prompt(clean_instructions) - completion = openai.ChatCompletion.create(model="gpt-3.5-turbo", \ temperature=0.2, \ messages=[{"role": "user", "content": prompt}]) @@ -260,6 +284,8 @@ def clean_dataframe(self, clean_instructions): return answer + + diff --git a/functions/chat_with_df.py b/functions/chat_with_df.py index e950ef9..68473b9 100644 --- a/functions/chat_with_df.py +++ b/functions/chat_with_df.py @@ -1,5 +1,6 @@ import pandas as pd +import subprocess import os from evadb.catalog.catalog_type import NdArrayType @@ -13,7 +14,10 @@ class ChatWithPandas(AbstractFunction): @setup(cacheable=False, function_type="FeatureExtraction", batchable=False) - def setup(self): + def setup(self, use_local_llm=False, local_llm_model=None, csv_path=None): + self.use_local_llm = use_local_llm + self.local_llm_model = local_llm_model + self.csv_path = csv_path pass @property @@ -57,7 +61,8 @@ def forward(self, df: pd.DataFrame) -> pd.DataFrame: response = "cleaned dataframe is saved to cleaned_df.csv" if type == "query": - response = smart_df.query_dataframe(query) + print("passing local llm equals", self.use_local_llm) + response = smart_df.query_dataframe(query, self.use_local_llm, self.local_llm_model, self.csv_path) elif type == "plot": response = smart_df.plot_dataframe(query) elif type == "manipulation": @@ -68,4 +73,4 @@ def forward(self, df: pd.DataFrame) -> pd.DataFrame: ans_df = pd.DataFrame(df_dict) return pd.DataFrame(ans_df) - + diff --git a/run_test.sh b/run_test.sh new file mode 100644 index 0000000..5900f18 --- /dev/null +++ b/run_test.sh @@ -0,0 +1,2 @@ +export PYTHONPATH=$PWD +python3 -m unittest discover test/ diff --git a/test/a_test_chat_with_pandas_open_ai.py b/test/a_test_chat_with_pandas_open_ai.py new file mode 100644 index 0000000..aa7ff2a --- /dev/null +++ b/test/a_test_chat_with_pandas_open_ai.py @@ -0,0 +1,58 @@ +import unittest +import os +import pandas as pd +import evadb + +class TestEvaDBFunctions(unittest.TestCase): + + def setUp(self): + self.conn = evadb.connect() + self.cursor = self.conn.cursor() + print("Connected to EvaDB") + + create_function_query = f"""CREATE FUNCTION IF NOT EXISTS ChatWithPandas + IMPL './functions/semantic_cache.py'""" + self.cursor.query("DROP FUNCTION IF EXISTS ChatWithPandas;").execute() + self.cursor.query(create_function_query).execute() + print("Created Function") + + create_table_query = """ + CREATE TABLE IF 
NOT EXISTS CARSDATA( + id INTEGER, + name TEXT(30), + mpg INTEGER, + cyl FLOAT(64,64), + disp FLOAT(64,64), + hp FLOAT(64,64), + drat FLOAT(64,64), + wt FLOAT(64,64), + qsec FLOAT(64,64), + vs FLOAT(64,64), + am FLOAT(64,64), + gear FLOAT(64,64), + carb FLOAT(64,64) + ); + """ + load_data_query = """ LOAD CSV 'data/cars.csv' INTO CARSDATA; + """ + + self.cursor.query(create_table_query).execute() + self.cursor.query(load_data_query).execute() + print("Loaded data") + + def test_mean_of_gear_column(self): + chat_query = "SELECT ChatWithPandas('what is the mean of the gear column', gear, name) FROM CARSDATA;" + result = self.cursor.query(chat_query).execute() + self.assertIsNotNone(result) + + def test_highest_gear_value_car(self): + chat_query = "SELECT ChatWithPandas('which car has the highest gear value', gear, name) FROM CARSDATA;" + result = self.cursor.query(chat_query).execute() + self.assertIsNotNone(result) + + def tearDown(self): + self.cursor.close() + print("Closed EvaDB connection") + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_chat_with_pandas_local_llm.py b/test/test_chat_with_pandas_local_llm.py new file mode 100644 index 0000000..63bea99 --- /dev/null +++ b/test/test_chat_with_pandas_local_llm.py @@ -0,0 +1,58 @@ +import unittest +import os +import pandas as pd +import evadb + +class TestEvaDBFunctions(unittest.TestCase): + + def setUp(self): + self.conn = evadb.connect() + self.cursor = self.conn.cursor() + print("Connected to EvaDB") + create_function_query = f"""CREATE FUNCTION IF NOT EXISTS ChatWithPandas + IMPL './functions/semantic_cache.py' use_local_llm 'True' local_llm_model "llama-2-7b-chat.ggmlv3.q4_0.bin"; + """ + self.cursor.query("DROP FUNCTION IF EXISTS ChatWithPandas;").execute() + self.cursor.query(create_function_query).execute() + print("Created Function") + + create_table_query = """ + CREATE TABLE IF NOT EXISTS CARSDATA( + id INTEGER, + name TEXT(30), + mpg INTEGER, + cyl FLOAT(64,64), + disp FLOAT(64,64), + hp FLOAT(64,64), + drat FLOAT(64,64), + wt FLOAT(64,64), + qsec FLOAT(64,64), + vs FLOAT(64,64), + am FLOAT(64,64), + gear FLOAT(64,64), + carb FLOAT(64,64) + ); + """ + load_data_query = """ LOAD CSV 'data/cars.csv' INTO CARSDATA; + """ + + self.cursor.query(create_table_query).execute() + self.cursor.query(load_data_query).execute() + print("Loaded data") + + def test_mean_of_gear_column(self): + chat_query = "SELECT ChatWithPandas('what is the mean of the gear column', gear, name) FROM CARSDATA;" + result = self.cursor.query(chat_query).execute() + print("RESULTT", result) + + def test_highest_gear_value_car(self): + chat_query = "SELECT ChatWithPandas('which car has the highest gear value', gear, name) FROM CARSDATA;" + result = self.cursor.query(chat_query).execute() + print("RESULTT", result) + + def tearDown(self): + self.cursor.close() + print("Closed EvaDB connection") + +if __name__ == '__main__': + unittest.main()
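
A note on the use_local_llm and local_llm_model parameters used in the CREATE FUNCTION statements above: they are passed as quoted SQL literals (use_local_llm 'True'), so ChatWithPandas.setup() most likely receives plain strings rather than Python booleans, and the later "if use_local_llm:" check in query_dataframe would treat even a value of 'False' as truthy. Below is a minimal sketch of one way to normalize such a flag; the helper name _as_bool is hypothetical and not part of this repository, and the exact form in which EvaDB hands these parameters to setup() should be verified.

# Hypothetical helper, assuming setup() receives the CREATE FUNCTION
# parameters as strings such as 'True' / 'False'.
def _as_bool(value, default=False):
    # Coerce None / bool / string values to a real boolean.
    if value is None:
        return default
    if isinstance(value, bool):
        return value
    return str(value).strip().lower() in ("1", "true", "yes")

# Example usage inside setup():
#     self.use_local_llm = _as_bool(use_local_llm)
# so that use_local_llm 'False' does not silently enable the local-LLM path.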