From 31399765aee5d70ab6428d0324886061c0e5e986 Mon Sep 17 00:00:00 2001
From: Preethi1609
Date: Wed, 27 Sep 2023 19:27:43 -0400
Subject: [PATCH 01/18] initial commit

---
 apps/data/eva_paper.txt |   1 +
 apps/pandas_qa_local.py | 143 +++++++++++++++++++++++++++++++
 utils/chunk.py          |  70 +++++++++++++++
 utils/script.py         | 185 ++++++++++++++++++++++++++++++++++++++++
 utils/search_term.py    | 114 +++++++++++++++++++++++++
 5 files changed, 513 insertions(+)
 create mode 100644 apps/data/eva_paper.txt
 create mode 100644 apps/pandas_qa_local.py
 create mode 100644 utils/chunk.py
 create mode 100644 utils/script.py
 create mode 100644 utils/search_term.py

diff --git a/apps/data/eva_paper.txt b/apps/data/eva_paper.txt
new file mode 100644
index 0000000..1dee18f
--- /dev/null
+++ b/apps/data/eva_paper.txt
@@ -0,0 +1 @@
+EVA: An End-to-End Exploratory Video Analytics System

Gaurav Tarlok Kakkar, Jiashen Cao, Pramod Chunduri, Zhuangdi Xu, Suryatej Reddy Vyalla, Prashanth Dintyala, Anirudh Prabakaran, Jaeho Bang, Aubhro Sengupta, Kaushik Ravichandran, Ishwarya Sivakumar, Aryan Rajoria, Ashmita Raju, Tushar Aggarwal, Abdullah Shah, Sanjana Garg, Shashank Suman, Myna Prasanna Kalluraya, Subrata Mitra†, Ali Payani‡, Yao Lu★, Umakishore Ramachandran, Joy Arulraj
Georgia Institute of Technology, †Adobe, ‡Cisco, ★Microsoft
arulraj@gatech.edu

Abstract
In recent years, deep learning models have revolutionized computer vision, enabling diverse applications. However, these models are computationally expensive, and leveraging them for video analytics involves low-level imperative programming. To address these efficiency and usability challenges, the database community has developed video database management systems (VDBMSs). However, existing VDBMSs lack extensibility and composability and do not support holistic system optimizations, limiting their practical application. In response to these issues, we present our vision for EVA, a VDBMS that allows for extensible support of user-defined functions and employs a Cascades-style query optimizer. Additionally, we leverage Ray’s distributed execution to enhance scalability and performance and explore hardware-specific optimizations to facilitate runtime optimizations. We discuss the architecture and design of EVA, our achievements thus far, and our research roadmap.

ACM Reference Format:
Gaurav Tarlok Kakkar, Jiashen Cao, Pramod Chunduri, Zhuangdi Xu, Suryatej Reddy Vyalla, Prashanth Dintyala, Anirudh Prabakaran, Jaeho Bang, Aubhro Sengupta, Kaushik Ravichandran, Ishwarya Sivakumar, Aryan Rajoria, Ashmita Raju, Tushar Aggarwal, Abdullah Shah, Sanjana Garg, Shashank Suman, Myna Prasanna Kalluraya, Subrata Mitra†, Ali Payani‡, Yao Lu★, Umakishore Ramachandran, Joy Arulraj. 2023. EVA: An End-to-End Exploratory Video Analytics System. In Data Management for End-to-End Machine Learning (DEEM ’23), June 18, 2023, Seattle, WA, USA. ACM, New York, NY, USA, 5 pages. https://doi.org/10.1145/3595360.3595858

This work is licensed under a Creative Commons Attribution International 4.0 License. DEEM ’23, June 18, 2023, Seattle, WA, USA. © 2023 Copyright held by the owner/author(s). ACM ISBN 979-8-4007-0204-4/23/06. https://doi.org/10.1145/3595360.3595858

1 INTRODUCTION
Advances in computer vision [11, 32] over the last decade have led to high interest among domain scientists and industry practitioners in leveraging vision models in their applications. However, there are efficiency and usability challenges associated with deploying vision pipelines in practice [20]. First, from a resource efficiency standpoint, these deep learning models are highly expensive to run on every frame of the video due to their depth (i.e., number of neural network layers). Second, from a usability standpoint, the domain scientist must do low-level imperative programming across many libraries (e.g., PyTorch [28], OpenCV [5], and Pandas [26]) to leverage these vision models.

To tackle these efficiency and usability challenges, database researchers have proposed several video database management systems (VDBMSs) [4, 9, 20, 22, 24, 34]. These systems improve usability by supporting declarative SQL-like queries over videos. VDBMSs have applications across several domains, including analyzing movies, monitoring wildlife behavior [12, 19], monitoring traffic [41], and analyzing retail store performance [36].
For example, a movie analyst may issue the following query to study the emotion palette of actors in a movie dataset [23]:

/* Movie Analysis */
SELECT EmotionClassification(Crop(data, bbox))
FROM MOVIE
     CROSS APPLY UNNEST(FaceDetection(data)) AS Face(bbox, conf)
WHERE id < 1000 AND conf > 0.8;

Listing 1: Illustrative EVAQL query

Here, the query invokes user-defined functions (UDFs) that wrap around vision models [29]. It first retrieves the bounding boxes of all the faces present in the initial 1000 frames of the MOVIE video using the FaceDetection UDF [35]. It filters out the faces for which the FaceDetection model has low confidence (< 0.8). Next, it identifies the emotion of each confidently-detected face using the EmotionClassification UDF.

Prior Work. To efficiently process such queries, state-of-the-art (SoTA) VDBMSs use a suite of database-inspired optimizations. For instance, PP [24] trains a lightweight model to quickly filter out irrelevant frames (e.g., frames that are not likely to contain a person), and only runs the heavyweight models on the subset of frames that pass through the filter model. It reduces query processing time and improves resource efficiency by reducing the number of invocations of the heavyweight oracle models.
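For intuition, here is a minimal sketch of this proxy-model pattern (an editorial illustration, not from the paper; proxy_model, oracle_model, and the 0.9 threshold are hypothetical):

def filtered_inference(frames, proxy_model, oracle_model, threshold=0.9):
    # Cheap pass: the proxy scores every frame; the expensive oracle model
    # runs only on the frames that the proxy lets through.
    results = []
    for frame in frames:
        if proxy_model.score(frame) < threshold:
            continue  # skip frames the proxy deems irrelevant
        results.append(oracle_model.predict(frame))  # heavyweight inference
    return results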
What do Existing Systems Lack?
(1) Extensibility and Composability: They do not allow users to define their own user-defined functions (UDFs) for vision models, and lack the ability to compose UDFs to accomplish complex tasks (Listing 1). Furthermore, these VDBMSs mainly focus on queries over detected video objects and do not support richer vision queries like action localization [37].
(2) Holistic System Optimizations: Prior systems primarily focus on optimizing each query in isolation, even though workloads have significant overlapping computation (e.g., redundant inference using a vision model over the same frame across queries) [40]. They often use lightweight proxy models to accelerate query execution. So, they do not support holistic optimization for more complex queries, both during query optimization and execution. These limitations significantly constrain the adoption of VDBMSs in practical applications. Raven [27] optimizes ML and relational pipelines with cross-query optimization. Gandhi et al. [16] utilize tensor abstraction for trainable pipelines in AI and relational workloads. We plan to support the training pipeline in the future.

Our Vision. To overcome these limitations, we are developing an innovative VDBMS specifically designed for exploratory video analytics: EVA. EVA provides extensible support for UDFs (§ 3.1), allowing users to define bespoke UDFs based on their requirements and compose them with existing UDFs and operators to construct complex queries. For example, the FaceDetection and EmotionClassification models can be used to construct an emotion detection query. Additionally, UDFs can import third-party Python packages and execute arbitrary logic, which makes it easy for EVA to support new features in the future. To optimize query plans, EVA contains a Cascades-style query optimizer (§ 3.2) that leverages different forms of derived models and data structures. Like relational DBMSs, EVA estimates the cost of query plans by profiling operator costs and estimating predicate selectivity. It goes further by optimizing for query accuracy (§ 4.2). Moreover, EVA’s distributed Execution Engine powered by Ray (§ 3.3) provides additional scalability and performance. We are also exploring hardware-specific optimizations and drawing inspiration from the adaptive query processing literature [13] to facilitate runtime optimizations (§ 4.3).

2 ARCHITECTURE OF EVA
The architecture of the EVA VDBMS is shown in Fig. 1. We first present the query language that the Parser supports. We then describe the internals of the other three components.

[Figure 1: Architecture of EVA — an input query flows through the Parser (EVA Query Language), the Cascades-style Query Optimizer, the Execution Engine (derived models, Ray, PyTorch, AQP), and the Storage Engine (video + derived data structures). Example queries: LOAD VIDEO "movies/*.mp4" INTO MOVIES; SELECT id, FaceDetector(data).bboxes FROM MOVIES;]

2.1 EVA Query Language (EVAQL)
EVA’s parser supports a query language tailored for exploratory video analytics, called EVAQL. The queries in this section all concern a movie dataset. EVA stores all the videos of this dataset in the following table:

MOVIE_DATA (
    ID SERIAL INTEGER,
    VIDEO_ID INTEGER,
    VIDEO_FRAME_ID INTEGER,
    VIDEO_NAME TEXT(30),
    DATA NDARRAY UINT8(3, ANYDIM, ANYDIM)
);

Listing 2: Schema of the movie dataset

Loading Data. EVA supports loading both videos and semi-structured data. The following query depicts how the user loads videos into EVA:

/* Loading a video into the table */
LOAD VIDEO 'movies/*.mp4' INTO MOVIE_DATA;

EVA automatically creates a table called MOVIE_DATA with the following columns: (1) id, (2) data, (3) video_id, (4) video_frame_id, and (5) video_name. They denote the frame identifier, the contents of the frame, and the video to which that frame belongs.
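These statements can also be issued from Python. A minimal sketch using the evadb cursor API that appears later in this patch (utils/script.py); the video path is illustrative:

import evadb

cursor = evadb.connect().cursor()
# Load the movies into the MOVIE_DATA table described above.
cursor.query("LOAD VIDEO 'movies/*.mp4' INTO MOVIE_DATA;").df()
# Inspect the automatically created columns for the first ten frames.
print(cursor.query(
    "SELECT id, video_id, video_frame_id, video_name FROM MOVIE_DATA WHERE id < 10;"
).df())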
EVAQL supports queries for loading structured data (e.g., CSVs) to populate the metadata of videos (e.g., bounding boxes of faces in a frame). Similar to traditional DBMSs, the user must explicitly define the schema before loading the CSV file:

/* Defining the schema and loading a CSV file */
CREATE TABLE IF NOT EXISTS MOVIE_METADATA (
    ID SERIAL INTEGER,
    VIDEO_ID INTEGER,
    VIDEO_FRAME_ID INTEGER,
    VIDEO_NAME TEXT(30),
    FACE_BBOXES NDARRAY FLOAT32(4)
);
LOAD CSV 'movie.csv' INTO MOVIE_METADATA;

User-Defined Functions. EVAQL is tailored for supporting user-defined functions (UDFs). UDFs allow users to extend the VDBMS to support the requirements of their applications. In EVA, UDFs are often wrappers around deep learning models. For example, a face detection UDF takes a frame as input and returns the bounding boxes of the faces detected in the frame as output. Internally, it wraps around a FaceDetection PyTorch model [35]. EVAQL supports arbitrary UDFs that take a variety of inputs (e.g., video metadata or raw frames) and generate a variety of outputs (e.g., labels, bounding boxes, video frames, etc.). The following command registers a FaceDetection UDF in EVA:

/* Registering a User-Defined Function */
CREATE UDF IF NOT EXISTS FaceDetector
TYPE FaceDetection
IMPL '/udfs/face_detector.py'
PROPERTIES ('ACCURACY'='HIGH');

TYPE specifies the logical model type of the UDF (e.g., FaceDetection or ObjectDetection). IMPL specifies the path to the Python file containing the implementation of the UDF. Internally, EVA uses importlib for creating and importing UDF objects from the file [14]. The user can specify other metadata like the accuracy in PROPERTIES. EVA uses these properties to accelerate queries. For example, if the overall query accuracy requirement is moderate (e.g., 0.8× the oracle model), EVA uses faster (but less accurate) models of the same model type to accelerate the query. After registering the UDF, it can be executed on a video as shown in Listing 1.

Interfaces. EVA currently supports EVAQL queries from both a command line interface and Jupyter notebooks. We seek to support a Pythonic dataframe API in the future.

2.2 Query Optimizer
EVA’s Optimizer is based on the Cascades framework [17]. It applies a series of rules for rewriting the query and then performs cost-based optimization to generate a physical query plan. The Optimizer in a VDBMS differs from that in a relational DBMS in two ways. First, it must focus on minimizing query processing time while meeting the accuracy constraint (which often does not exist in a typical relational DBMS). Second, it is expensive to derive statistics from videos a priori, as that involves running expensive deep learning models. So, while processing an ad-hoc query, the Optimizer runs vision models on a subset of frames to guide important optimization decisions (e.g., whether the query plan will meet the accuracy constraint, or how the predicates invoking vision models should be ordered [22, 31, 40]).

[Figure 2: Illustrative UDF Optimization Rules — (a) a UDF transformation rule that extracts the UDF from the predicate and converts it to an APPLY operator, (b) a UDF filtering rule that introduces a proxy UDF model for quickly filtering out irrelevant frames before executing the UDF, and (c) a UDF reordering rule that reorders UDFs based on their inference cost and the availability of materialized results from prior queries.]
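To make the sampling-based ordering decision concrete, here is a sketch of the classic rank heuristic (an editorial illustration under assumed cost/eval interfaces, not EVA's exact policy):

def order_predicates(predicates, sample_frames):
    # Profile each model-backed predicate on a small sample of frames,
    # then run cheap, highly selective predicates first.
    ranked = []
    for pred in predicates:
        avg_cost = sum(pred.cost(f) for f in sample_frames) / len(sample_frames)
        selectivity = sum(pred.eval(f) for f in sample_frames) / len(sample_frames)
        # Rank: estimated cost paid per frame filtered out (lower is better).
        rank = avg_cost / max(1e-9, 1.0 - selectivity)
        ranked.append((rank, pred))
    return [pred for _, pred in sorted(ranked, key=lambda pair: pair[0])]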
2.3 Execution Engine
The Execution Engine is responsible for evaluating the query plan generated by the Optimizer. While executing the plan, it leverages heterogeneous computational units (e.g., CPUs and GPUs). EVA leverages DL frameworks like PyTorch [28] for model inference. In an earlier prototype of EVA [40], the Execution Engine did not support distributed query execution. We have recently added support for distributed query execution (§ 3.3) using Ray [25].

2.4 Storage Engine
Lastly, the Storage Engine is responsible for managing the videos. In an earlier prototype of EVA [40], the Storage Engine organized the videos as a sequence of decoded frames, similar to SoTA VDBMSs [20]. However, this approach not only significantly increases the storage footprint of EVA on larger datasets but also does not provide any significant reduction in query execution time. We have subsequently redesigned the Storage Engine to manage videos in a compressed format. The Storage Engine manages structured data (e.g., bounding boxes of faces) on disk using the Parquet format [1]. It uses Arrow [30] as an in-memory columnar format for data that is being read or written using on-disk Parquet files.

3 PROGRESS
We are implementing EVA as a Python package [2] with an Apache License, based on a client-server architecture. We have made progress on enhancing the extensibility of EVA, and the efficacy of the Optimizer and the Execution Engine.

3.1 Extensibility - Importing UDFs
EVA allows users to import their own UDFs in two ways. Users can either import their own implemented UDFs (i.e., from source) or from popular third-party platforms (e.g., HuggingFace [39], PyTorch).

UDF from Source. EVA supports defining UDFs using function decorators in Python. This allows users to migrate their existing deep learning models to EVA with a few lines of Python code. Users define the input and output formats of their models and configuration options through the decorator-based syntax. In the listing below, the @setup decorator specifies the configuration options for the UDF. The user specifies the properties – whether EVA can cache the results of the UDF, whether the UDF supports batch-mode execution, and so on. The @forward decorator specifies the input and output types/dimensions for the UDF.

# Configuring a UDF with decorators
class ImageClassificationUDF:
    @setup(cachable=True, batchable=True,
           udf_type="ImageClassification")
    def setup(self):
        # prepare the UDF
        ...

    @forward(
        input_signatures=[PyTorchTensor(
            type=NdArrayType.FLOAT32,
            dimensions=(1, 3, 540, 960))],
        output_signatures=[PandasDataframe(
            columns=["label"],
            column_types=[NdArrayType.STR])],
    )
    def forward(self):
        # do inference
        ...

UDF from HuggingFace. Recently, HuggingFace [39] has gained popularity amongst the deep learning community for its support of various models across multiple data modalities (e.g., text, audio, video, etc.). EVA supports HuggingFace tasks and models right out of the box. Users define tasks or specify models using EVA’s declarative language:

/* Registering an ObjectDetector model */
CREATE UDF MyObjectDetector
TYPE HuggingFace
PROPERTIES ('task'='object-detection',
            'model'='facebook/detr-resnet-50');

Here, the user adds a UDF that performs object detection using the model facebook/detr-resnet-50.
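Once registered, the UDF composes like any other. A usage sketch via the same cursor API used in utils/script.py later in this patch (the query itself is an assumption):

# Run the registered HuggingFace detector over the first 100 frames.
detections = cursor.query(
    "SELECT MyObjectDetector(data) FROM MOVIE_DATA WHERE id < 100;"
).df()
print(detections.head())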
3.2 Query Optimizer - Reuse of Inference Results
UDFs are often the most expensive operators in VDBMS queries. To accelerate such queries, EVA materializes the results of UDFs and reuses them while processing subsequent queries in exploratory video analytics [40]. Reusing the results of UDFs in VDBMSs differs from the query plan matching algorithms in traditional DBMSs [3] that focus on expensive join operators. In contrast, in VDBMSs, UDFs frequently occur in predicates and projection lists. EVA’s optimizer supports novel general-purpose rewrite rules that are not present in SoTA VDBMSs. For example, to identify reuse opportunities, the Optimizer uses a UDF-centric rewrite rule (Fig. 2 (a)) that extracts the UDF from the predicate/projection expression and rewrites it using the CROSS APPLY operator [15]. The resulting query plan makes it feasible to explore rules like: (1) materializing and reusing results of the UDFs [40], (2) adding derived models (Fig. 2 (b)) [20, 21], (3) UDF reordering (Fig. 2 (c)), (4) UDF deduplication, and (5) introducing a video sampling operator before the UDF. Here, UDF deduplication refers to avoiding redundant computation of a UDF that occurs multiple times in a single query. For example, if both the UDFs in the left-hand-side query tree in Fig. 2 (c) are identical, we merge them into a single apply operator.

3.3 Execution Engine - Integrating Ray
Our primary objective in integrating Ray into EVA is to support distributed query execution. We seek to initially support intra-query parallelism [18]. Consider a query that involves running FaceDetection on a movie video with 13K frames using a server with two GPUs. With a single GPU, it takes 402 s to process the query. Using Ray, EVA automatically splits the video into two partitions and uses both GPUs for model inference, reducing the query processing time to 209 s. Besides data-level parallelism, EVA also supports parallel processing of complex query predicates. For example, to evaluate “UDF1(a) < 10 AND UDF2(b) > 20”, the VDBMS may either evaluate the two atomic predicates in parallel, or perform canonical predicate reordering and short-circuit the predicate evaluation.

[Figure 3: Illustration of the Exchange Operator — the original plan (dop=1) is transformed into an exchange plan in which two processes (dop=2, one GPU each) run FaceDetector and a single process runs EmotionClassifier; this query retrieves the emotions of all the faces detected in the video.]

Exchange Operator. The Optimizer uses the exchange operator [6] to encapsulate the degree of parallelism (dop) in the query plan. The exchange operator splits the plan into two stages and configures the parallelism of the lower stage. Consider the query plan shown in Fig. 3. First, as specified by the lower exchange operator, two processes run the FaceDetection UDF on the video. Then, the upper exchange operator indicates that a single process should run the EmotionClassification UDF on the bounding boxes of the detected faces. To leverage Ray, the Optimizer in EVA transforms the query plan into Ray actors and chains them via Ray queues.
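The dop=2 stage in Fig. 3 roughly corresponds to the following Ray sketch (an editorial illustration; load_frames and load_face_detector are hypothetical helpers, and one actor is assumed per GPU):

import ray

ray.init()

@ray.remote(num_gpus=1)
class DetectorActor:
    def __init__(self):
        self.model = load_face_detector()  # hypothetical model loader

    def detect(self, frames):
        return [self.model(frame) for frame in frames]

frames = load_frames("movie.mp4")  # hypothetical decoder: list of frames
# Exchange with dop=2: partition the frames across two GPU-backed actors.
actors = [DetectorActor.remote() for _ in range(2)]
parts = [frames[::2], frames[1::2]]
futures = [actor.detect.remote(part) for actor, part in zip(actors, parts)]
results = ray.get(futures)  # both partitions are processed concurrently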
4 ROADMAP
We next describe our ongoing work and open questions in implementing EVA. We seek to continue improving the usability of EVA, and also the efficacy of the Optimizer and the Execution Engine.

4.1 Extensibility - Enhancing Querying Capability
Action Queries. In our prior work on Zeus [8], we emphasized the need to improve the querying capabilities of VDBMSs to encompass action queries. Zeus assumes the availability of a vision model explicitly trained for the target action (e.g., a person riding a motorcycle). However, in real-world applications the action may rarely occur in the dataset, leading to insufficient true positive examples (i.e., class imbalance) during training. In addition, the number of ad-hoc combinations of objects and their interactions that form the actions is exponential. To overcome these challenges, we seek to pursue a more practical approach in EVA. We are investigating techniques to break ad-hoc actions into a collection of spatio-temporal predicates over the bounding boxes and the trajectories of objects across a sequence of frames [10, 33].

Similarity Search. To meet the needs of real-world applications [38], we seek to support object re-identification and similarity search queries in EVA. Consider a query that retrieves all the frames in a movie that contain a target actor. Efficiently searching for the specific actor using a target image requires the use of computationally expensive object re-identification models. We are currently investigating the integration of incremental search techniques into EVA’s Optimizer to accelerate re-identification queries.

4.2 Query Optimizer - Accuracy-Guided Optimization
As in relational DBMSs, the VDBMS’s Optimizer estimates a query plan’s cost by profiling the cost of the operators and estimating the selectivity of predicates. However, there are two key differences. First, deep learning models are not always accurate. So, unlike relational DBMSs, VDBMSs cannot guarantee accurate results. This gives the Optimizer an opportunity to jointly optimize the query plan for both runtime performance and accuracy constraints. Second, the Optimizer must not treat a UDF as a black box. Instead, it should exploit the semantic properties of UDFs. For example, the Optimizer in EVA has the flexibility to pick a suitable physical model for processing a logical vision task, as long as it meets the query’s accuracy constraint. In our prior work [7], we showed how the Optimizer may dynamically pick different models for processing video chunks of varying complexity. We are investigating how to extend the Cascades-style Optimizer in EVA to jointly optimize for query execution cost and query accuracy. We seek to support complex model pipelines – proxy models, model cascades, and model ensembles.
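As a concrete reading of accuracy-guided model selection, a minimal sketch (editorial; the variant registry and profiled numbers are assumptions):

def pick_physical_model(variants, min_relative_accuracy=0.8):
    # variants: list of (model, relative_accuracy, cost_per_frame) tuples,
    # where relative_accuracy is measured against the oracle model.
    feasible = [v for v in variants if v[1] >= min_relative_accuracy]
    if not feasible:
        raise ValueError("no model variant meets the accuracy constraint")
    # Among the variants that satisfy the constraint, pick the cheapest.
    return min(feasible, key=lambda v: v[2])[0]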
4.3 Execution Engine - GPU-aware Optimization
Resource utilization. As EVA extensively uses GPUs for query processing, it is critical to optimize query execution on GPUs. The Optimizer needs to insert the exchange operator and tune the degree-of-parallelism (DOP) parameter. The optimal DOP value depends on the model execution cost, the overall query, and the underlying data. We are investigating how to optimize this critical parameter to better leverage GPUs. Concretely, given the number of GPUs and their computational capabilities, EVA must decide where to inject the exchange operators in the query plan, and what the suitable degree of parallelism is for each operator. To achieve this, the Optimizer first generates a statically optimized plan. Later, it leverages the adaptive Execution Engine by adjusting the pipeline dynamically during execution to reduce overall processing time.

Minimize data transfer cost. First, in queries with multiple UDFs, the same input frames may be transferred to the GPU multiple times (from the CPU) during query execution. Second, EVA only has CPU implementations of certain operators like join, predicate filtering, and cropping. That results in data transfer between the CPU and GPU across different operators (e.g., 10 GB of additional data movement for the query shown in Listing 1). To minimize this cost, we seek to investigate two optimizations: (1) lazy eviction and (2) operator fusion. First, with lazy eviction, the Execution Engine caches the frames on the GPU if they are required by later operators in the query pipeline. Second, with operator fusion, we plan to add GPU-centric implementations of general-purpose operators (e.g., join and image cropping) to reduce data movement overhead.

5 CONCLUSION
In this paper, we present our vision, current progress, and roadmap for future improvements to EVA, focusing on querying capability, query optimization, and query execution. We hope that EVA will enable a broader set of application developers to leverage recent advances in vision for analyzing unstructured data.

References
[1] Apache Parquet. https://parquet.apache.org/.
[2] EVA Video Database System. https://pypi.org/project/evadb/.
[3] A. Jindal, K. Karanasos, S. Rao, and H. Patel. Selecting Subexpressions to Materialize at Datacenter Scale. In VLDB, 2018.
[4] F. Bastani, S. He, A. Balasingam, K. Gopalakrishnan, M. Alizadeh, H. Balakrishnan, M. Cafarella, T. Kraska, and S. Madden. MIRIS: Fast Object Track Queries in Video. In SIGMOD, pages 1907–1921, 2020.
[5] G. Bradski. The OpenCV Library. Dr. Dobb’s Journal of Software Tools, 2000.
[6] E. Brewer. Volcano & the Exchange Operator, 2022.
[7] J. Cao, K. Sarkar, R. Hadidi, J. Arulraj, and H. Kim. FiGO: Fine-Grained Query Optimization in Video Analytics. In SIGMOD, pages 559–572, 2022.
[8] P. Chunduri, J. Bang, Y. Lu, and J. Arulraj. Zeus: Efficiently Localizing Actions in Videos Using Reinforcement Learning. In SIGMOD, pages 545–558, 2022.
[9] M. Daum, B. Haynes, D. He, A. Mazumdar, M. Balazinska, and A. Cheung. TASM: A Tile-Based Storage Manager for Video Analytics. ArXiv, abs/2006.02958, 2020.
[10] M. Daum, E. Zhang, D. He, M. Balazinska, B. Haynes, R. Krishna, A. Craig, and A. Wirsing. VOCAL: Video Organization and Interactive Compositional AnaLytics. In CIDR, 2022.
[11] J. Dean, D. Patterson, and C. Young. A New Golden Age in Computer Architecture: Empowering the Machine-Learning Revolution. IEEE Micro, 38(2):21–29, 2018.
[12] J. Dellinger, C. Shores, A. Craig, S. Kachel, M. Heithaus, W. Ripple, and A. Wirsing. Predators reduce niche overlap between sympatric prey. Oikos, Dec. 2021.
[13] A. Deshpande, Z. Ives, V. Raman, et al. Adaptive query processing. Foundations and Trends in Databases, 1(1):1–140, 2007.
[14] Python Software Foundation. importlib – the implementation of import, 2022.
[15] C. Galindo-Legaria and M. Joshi. Orthogonal optimization of subqueries and aggregation. In SIGMOD, 2001.
[16] A. Gandhi, Y. Asada, V. Fu, A. Gemawat, L. Zhang, R. Sen, C. Curino, J. Camacho-Rodríguez, and M. Interlandi. The Tensor Data Platform: Towards an AI-centric Database System. In CIDR, 2023.
[17] G. Graefe. The Cascades Framework for Query Optimization. IEEE Data Eng. Bull., 18(3):19–29, 1995.
[18] N. Hardavellas and I. Pandis. Intra-Query Parallelism, pages 1567–1568. Springer US, Boston, MA, 2009.
[19] M. Heithaus, L. Dill, G. Marshall, and B. Buhleier. Habitat use and foraging behavior of tiger sharks (Galeocerdo cuvier) in a seagrass ecosystem. Marine Biology, 140(2):237–248, 2002.
[20] D. Kang, P. Bailis, and M. Zaharia. BlazeIt: Optimizing Declarative Aggregation and Limit Queries for Neural Network-Based Video Analytics. Proc. VLDB Endow., 13:533–546, 2019.
[21] D. Kang, J. Emmons, F. Abuzaid, P. Bailis, and M. Zaharia. NoScope: Optimizing Neural Network Queries over Video at Scale. Proc. VLDB Endow., 10(11):1586–1597, Aug. 2017.
[22] D. Kang, F. Romero, P. D. Bailis, C. Kozyrakis, and M. Zaharia. VIVA: An End-to-End System for Interactive Video Analytics. In CIDR, 2022.
[23] G. Liu, H. Shi, A. Kiani, A. Khreishah, J. Lee, N. Ansari, C. Liu, and M. M. Yousef. Smart Traffic Monitoring System using Computer Vision and Edge Computing. IEEE Transactions on Intelligent Transportation Systems, 2021.
[24] Y. Lu, A. Chowdhery, S. Kandula, and S. Chaudhuri. Accelerating Machine Learning Inference with Probabilistic Predicates. In SIGMOD, 2018.
[25] P. Moritz, R. Nishihara, S. Wang, A. Tumanov, R. Liaw, E. Liang, M. Elibol, Z. Yang, W. Paul, M. I. Jordan, and I. Stoica. Ray: A Distributed Framework for Emerging AI Applications. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18), pages 561–577, Carlsbad, CA, Oct. 2018. USENIX Association.
[26] Pandas. pandas-dev/pandas: Pandas, Feb. 2020.
[27] K. Park, K. Saur, D. Banda, R. Sen, M. Interlandi, and K. Karanasos. End-to-end Optimization of Machine Learning Prediction Queries. In SIGMOD, pages 587–601, 2022.
[28] A. Paszke, S. Gross, F. Massa, A. Lerer, J. Bradbury, G. Chanan, T. Killeen, Z. Lin, N. Gimelshein, L. Antiga, A. Desmaison, A. Köpf, E. Yang, Z. DeVito, M. Raison, A. Tejani, S. Chilamkurthy, B. Steiner, L. Fang, J. Bai, and S. Chintala. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In NeurIPS, 2019.
[29] A. Rheinländer, U. Leser, and G. Graefe. Optimization of Complex Dataflows with User-Defined Functions. ACM Computing Surveys, 50(3):1–39, 2017.
[30] N. Richardson, I. Cook, N. Crane, D. Dunnington, R. François, J. Keane, D. Moldovan-Grünfeld, J. Ooms, and Apache Arrow. arrow: Integration to Apache Arrow, 2022. https://github.com/apache/arrow/, https://arrow.apache.org/docs/r/.
[31] F. Romero, J. Hauswald, A. Partap, D. Kang, M. Zaharia, and C. Kozyrakis. Optimizing Video Analytics with Declarative Model Relationships. Proc. VLDB Endow., 16(3):447–460, 2022.
[32] O. Russakovsky, J. Deng, H. Su, J. Krause, S. Satheesh, S. Ma, Z. Huang, A. Karpathy, A. Khosla, and M. Bernstein. ImageNet Large Scale Visual Recognition Challenge. IJCV, 115(3):211–252, 2015.
[33] M. A. Sakr and R. H. Güting. Spatiotemporal pattern queries. GeoInformatica, 15(3):497–540, 2011.
[34] M. Satyanarayanan, P. B. Gibbons, L. B. Mummert, P. Pillai, P. Simoens, and R. Sukthankar. Cloudlet-Based Just-in-Time Indexing of IoT Video. In Global Internet of Things Summit (GIoTS), Geneva, Switzerland, pages 1–8. IEEE, 2017.
[35] F. Schroff, D. Kalenichenko, and J. Philbin. FaceNet: A Unified Embedding for Face Recognition and Clustering. In CVPR, pages 815–823, 2015.
[36] A. W. Senior, L. M. Brown, A. Hampapur, C. Shu, Y. Zhai, R. S. Feris, Y. Tian, S. Borger, and C. R. Carlson. Video Analytics for Retail. In AVSS, pages 423–428. IEEE Computer Society, 2007.
[37] Z. Shou, D. Wang, and S.-F. Chang. Temporal Action Localization in Untrimmed Videos via Multi-stage CNNs. In CVPR, pages 1049–1058, 2016.
[38] T. Skopal, F. Falchi, J. Lokoc, M. L. Sapino, I. Bartolini, and M. Patella, editors. Similarity Search and Applications – 15th International Conference, SISAP 2022, Bologna, Italy, October 5–7, 2022, Proceedings, volume 13590 of Lecture Notes in Computer Science. Springer, 2022.
[39] T. Wolf, L. Debut, V. Sanh, J. Chaumond, C. Delangue, A. Moi, P. Cistac, T. Rault, R. Louf, M. Funtowicz, J. Davison, S. Shleifer, P. von Platen, C. Ma, Y. Jernite, J. Plu, C. Xu, T. L. Scao, S. Gugger, M. Drame, Q. Lhoest, and A. M. Rush. Transformers: State-of-the-Art Natural Language Processing. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, pages 38–45, Online, Oct. 2020. Association for Computational Linguistics.
[40] Z. Xu, G. T. Kakkar, J. Arulraj, and U. Ramachandran. EVA: A Symbolic Approach to Accelerating Exploratory Video Analytics with Materialized Views. In SIGMOD, pages 602–616, 2022.
[41] S. Yang, E. Bailey, Z. Yang, J. Ostrometzky, G. Zussman, I. Seskar, and Z. Kostic. COSMOS Smart Intersection: Edge Compute and Communications for Bird’s Eye Object Tracking. In PerCom, pages 1–7. IEEE, 2020.
\ No newline at end of file
diff --git a/apps/pandas_qa_local.py b/apps/pandas_qa_local.py
new file mode 100644
index 0000000..2859f86
--- /dev/null
+++ b/apps/pandas_qa_local.py
@@ -0,0 +1,143 @@
+# coding=utf-8
+# Copyright 2018-2023 EvaDB
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import shutil
+from typing import Dict
+
+import pandas as pd
+from gpt4all import GPT4All
+
+APP_SOURCE_DIR = os.path.abspath(os.path.dirname(__file__))
+CURRENT_WORKING_DIR = os.getcwd()  # used to locate evadb_data dir
+
+# default file paths
+DEFAULT_TEXT_FILE_PATH = os.path.join(APP_SOURCE_DIR, "data", "eva_paper.txt")
+MAX_CHUNK_SIZE = 2000
+
+
+def receive_user_input() -> Dict:
+    """Receives user input.
+
+    Returns:
+        user_input (dict): global configurations
+    """
+    print(
+        "🔮 Welcome to EvaDB! This app lets you run data analytics on a text file in a conversational manner.\nYou only need to supply a path to the text file.\n\n"
+    )
+    user_input = dict()
+
+    text_file_path = str(
+        input("📋 Enter the text file path (press Enter to use our default text file): ")
+    )
+
+    if text_file_path == "":
+        text_file_path = DEFAULT_TEXT_FILE_PATH
+    user_input["text_file_path"] = text_file_path
+
+    return user_input
+
+
+def generate_script(df: pd.DataFrame, question: str) -> str:
+    """Generates a pandas script with the LLM.
+
+    Args:
+        df (pd.DataFrame): dataframe the generated script should analyze.
+        question (str): question to ask the LLM.
+
+    Returns:
+        str: script generated by the LLM.
+    """
+    # stringify all columns so df.head() can be embedded in the prompt
+    all_columns = list(df)  # creates list of all column headers
+    df[all_columns] = df[all_columns].astype(str)
+
+    prompt = f"""There is a dataframe in pandas (python). The name of the
+    dataframe is df. This is the result of print(df.head()):
+    {str(df.head())}
+    Assuming the dataframe is already loaded and named 'df', do not include
+    pd.read_csv and do not write code to load the CSV file. Return a python
+    script to get the answer to the following question.
+    Question: {question}."""
+
+    llm = GPT4All("llama-2-7b-chat.ggmlv3.q4_0.bin")
+
+    script_body = llm.generate(prompt)
+    # extract the fenced code block and drop the leading "python" language tag
+    script_body = script_body.split("```")[1].removeprefix("python").strip()
+    return script_body
+
+
+def cleanup():
+    """Removes any temporary file / directory created by EvaDB."""
+    if os.path.exists("evadb_data"):
+        shutil.rmtree("evadb_data")
+
+
+def split_text_into_line_chunks(text, max_chunk_size=MAX_CHUNK_SIZE):
+    """Splits text into chunks of at most max_chunk_size characters,
+    breaking only at line boundaries."""
+    chunks = []
+    current_chunk = ""
+
+    for line in text.splitlines():
+        if len(current_chunk) + len(line) + 1 <= max_chunk_size:
+            # add line to the current chunk
+            if current_chunk:
+                current_chunk += '\n'
+            current_chunk += line
+        else:
+            # start a new chunk; skip the initial empty chunk
+            if current_chunk:
+                chunks.append(current_chunk)
+            current_chunk = line
+
+    if current_chunk:
+        chunks.append(current_chunk)
+
+    return chunks
+
+
+def split_text_into_chunks(text, max_chunk_size=MAX_CHUNK_SIZE):
+    """Splits text into fixed-size character chunks (may break mid-line)."""
+    chunks = []
+    start = 0
+    end = max_chunk_size
+
+    while start < len(text):
+        chunk = text[start:end]
+        chunks.append(chunk)
+        start = end
+        end += max_chunk_size
+
+    return chunks
+
+
+if __name__ == "__main__":
+    # receive input from user
+    user_input = receive_user_input()
+    df = pd.read_csv(user_input["text_file_path"], names=['text'])
+    with open(user_input["text_file_path"], 'r') as file:
+        file_contents = file.read()
+
+    # split the contents into chunks and persist them for inspection
+    text_chunks = split_text_into_chunks(file_contents)
+    chunked_output_file = "data_chunks.txt"
+    i = 0
+    with open(chunked_output_file, 'w') as chunked_file:
+        for chunk in text_chunks:
+            i = i + 1
+            chunk = chunk + '\n\n\n\n\n\n\n'  # blank lines separate chunks
+            chunked_file.write(chunk)
+            print("chunk " + str(i))
+
+    print(f"Text chunks saved to {chunked_output_file}")
+    llm = GPT4All("llama-2-7b-chat.ggmlv3.q4_0.bin")
+    summaries = []
+    for chunk in text_chunks:
+        summaries.append(llm.generate("Summarize this text: " + chunk))
+    print("Summaries:", summaries)
\ No newline at end of file
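Note that generate_script() is defined above but never called by the __main__ block; a minimal usage sketch (the CSV path and question are illustrative, and exec() should only run LLM-generated code in a trusted sandbox):

df = pd.read_csv("some_table.csv")
script = generate_script(df, "How many rows does the dataframe have?")
exec(script)  # executes the generated pandas snippet against the in-scope df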
+ """ + + @property + def name(self) -> str: + return "Chunk" + + @setup(cacheable=False) + def setup(self) -> None: + # Any setup or initialization can be done here if needed + pass + + @forward( + input_signatures=[ + PandasDataframe( + columns=["text"], + column_types=[ColumnType.TEXT], + column_shapes=[(None,)], + ) + ], + output_signatures=[ + PandasDataframe( + columns=["chunks"], + column_types=[ColumnType.TEXT], + column_shapes=[(None,)], + ) + ], + ) + def forward(self, input_dataframe): + # Ensure input is provided + if input_dataframe.empty: + raise ValueError("Input DataFrame must not be empty.") + + # Define the maximum number of tokens per chunk + max_tokens_per_chunk = 100 # Adjust this value as needed + + # Initialize lists for the output DataFrame + output_strings = [] + + # Iterate over rows of the input DataFrame + for _, row in input_dataframe.iterrows(): + input_string = row["text"] + + # Split the input string into chunks of maximum tokens + chunks = [input_string[i:i + max_tokens_per_chunk] for i in range(0, len(input_string), max_tokens_per_chunk)] + + output_strings.extend(chunks) + + # Create a DataFrame with the output strings + output_dataframe = pd.DataFrame({"chunks": output_strings}) + + return output_dataframe \ No newline at end of file diff --git a/utils/script.py b/utils/script.py new file mode 100644 index 0000000..604c562 --- /dev/null +++ b/utils/script.py @@ -0,0 +1,185 @@ +import json +import csv +import os +import evadb +import pandas as pd + +# Specify the directory containing your JSON files and the desired CSV file name +JSON_DIRECTORY = "./atlanta" +PROJECT_NAME = "postgres" + +CSV_FILE_PATH = f'{PROJECT_NAME}.csv' + +# Initialize an empty list to store the combined data from all JSON files +combined_data = [] + +# Iterate through each JSON file in the directory +for filename in os.listdir(JSON_DIRECTORY): + if filename.endswith('.json'): + json_file_path = os.path.join(JSON_DIRECTORY, filename) + + # Open the JSON file for reading + with open(json_file_path, 'r', encoding='utf-8') as json_input_file: + # Load the JSON data from the file + json_data = json.load(json_input_file) + for json_obj in json_data: + json_obj['date'] =\ + os.path.basename(str(json_file_path)) + + # Append the JSON data to the combined_data list + combined_data.extend(json_data) + +# Specify the headers for your CSV file based on the keys present in the JSON data +# This will ensure that only common keys across all JSON objects are included +csv_headers = list(set().union(*(d.keys() for d in combined_data))) + +# Open the CSV file for writing +with open(CSV_FILE_PATH, 'w', newline='', encoding='utf-8') as csv_output_file: + # Create a CSV writer + csv_writer = csv.DictWriter(csv_output_file, fieldnames=csv_headers) + + # Write the headers to the CSV file + csv_writer.writeheader() + + # Write the combined JSON data to the CSV file + csv_writer.writerows(combined_data) + +print(f'Conversion from JSON to CSV complete. 
+
+# Specify the input CSV file and output CSV file
+# (the file is fully read into memory before being rewritten in place)
+input_csv_file = CSV_FILE_PATH
+output_csv_file = CSV_FILE_PATH
+
+# Define the old and new column names
+old_column_name = 'metadata'
+new_column_name = 'metadata_slack'
+
+# Read the input CSV file and create a list of rows
+with open(input_csv_file, 'r', newline='', encoding='utf-8') as input_file:
+    # Create a CSV reader
+    csv_reader = csv.reader(input_file)
+
+    # Read the header row
+    header = next(csv_reader)
+
+    # Find the index of the old column name in the header
+    try:
+        old_index = header.index(old_column_name)
+    except ValueError:
+        # Handle the case where the old column name is not found in the header
+        print(f'Column name "{old_column_name}" not found in the header.')
+        exit(1)
+
+    # Update the header with the new column name
+    header[old_index] = new_column_name
+
+    # Read the rest of the rows
+    rows = list(csv_reader)
+
+# Write the modified CSV data to the output file
+with open(output_csv_file, 'w', newline='', encoding='utf-8') as output_file:
+    # Create a CSV writer
+    csv_writer = csv.writer(output_file)
+
+    # Write the updated header
+    csv_writer.writerow(header)
+
+    # Write the rest of the rows
+    csv_writer.writerows(rows)
+
+print(f'Column name "{old_column_name}" has been changed to "{new_column_name}" in {output_csv_file}')
+
+if __name__ == "__main__":
+    try:
+        # establish evadb api cursor
+        print("⏳ Establishing evadb connection...")
+        cursor = evadb.connect().cursor()
+        print("✅ evadb connection setup complete!")
+
+        print(f'{CSV_FILE_PATH}')
+
+        cursor.query("DROP FUNCTION IF EXISTS Chunk;").df()
+
+        cursor.query("""
+            CREATE FUNCTION Chunk
+            INPUT (text TEXT(1000))
+            OUTPUT (chunks TEXT(1000))
+            TYPE StringProcessing
+            IMPL 'chunk.py';
+        """).df()
+
+        cursor.query("DROP FUNCTION IF EXISTS Contains;").df()
+
+        cursor.query("""
+            CREATE FUNCTION Contains
+            INPUT (input_string TEXT(1000), substring TEXT(1000))
+            OUTPUT (contains BOOLEAN)
+            TYPE StringProcessing
+            IMPL 'contains.py';
+        """).df()
+
+        cursor.query("DROP TABLE IF EXISTS SlackCSV;").df()
+
+        cursor.query("""CREATE TABLE SlackCSV(
+            blocks TEXT(1000),
+            user_profile TEXT(1000),
+            reply_count TEXT(1000),
+            edited TEXT(1000),
+            user TEXT(1000),
+            username TEXT(1000),
+            bot_id INTEGER,
+            text TEXT(1000),
+            user_team TEXT(1000),
+            replies TEXT(1000),
+            icons TEXT(1000),
+            hidden TEXT(1000),
+            delete_original TEXT(1000),
+            pinned_to TEXT(1000),
+            latest_reply TEXT(1000),
+            old_name TEXT(1000),
+            team TEXT(1000),
+            reply_users TEXT(1000),
+            metadata_slack TEXT(1000),
+            replace_original TEXT(1000),
+            subscribed TEXT(1000),
+            reply_users_count TEXT(1000),
+            parent_user_id TEXT(1000),
+            thread_ts TEXT(1000),
+            attachments TEXT(1000),
+            subtype TEXT(1000),
+            last_read TEXT(1000),
+            client_msg_id TEXT(1000),
+            bot_profile TEXT(1000),
+            reactions TEXT(1000),
+            files TEXT(1000),
+            name TEXT(1000),
+            inviter TEXT(1000),
+            upload TEXT(1000),
+            type TEXT(1000),
+            ts TEXT(1000),
+            purpose TEXT(1000),
+            source_team TEXT(1000),
+            date TEXT(1000)
+        );
+        """).df()
+
+        cursor.query(f"LOAD CSV '{CSV_FILE_PATH}' INTO SlackCSV;").df()
+
+        pd.set_option('display.max_columns', None)  # show all columns
+        pd.set_option('display.expand_frame_repr', False)
+        pd.set_option('display.max_colwidth', None)
+
+        # execute a select query
+        select_query = cursor.query(
+            """SELECT Chunk(text)
+               FROM SlackCSV
+               WHERE _row_id < 100 AND Contains(text, "predict") = "True";
+            """).df()
+        print(select_query)
+
+    except Exception as e:
+        print("❗️ Session ended with an error.")
+        print(e)
+
+exit(0)
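The Contains function registered above points at contains.py, which this patch does not add. A hedged sketch of what a matching implementation could look like, mirroring the structure of utils/chunk.py (the signatures are inferred from the CREATE FUNCTION statement, not taken from the repository):

# contains.py (hypothetical implementation)
import pandas as pd
from evadb.catalog.catalog_type import ColumnType
from evadb.functions.abstract.abstract_function import AbstractFunction
from evadb.functions.decorators.decorators import forward, setup
from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe

class Contains(AbstractFunction):
    @property
    def name(self) -> str:
        return "Contains"

    @setup(cacheable=False)
    def setup(self) -> None:
        pass

    @forward(
        input_signatures=[
            PandasDataframe(
                columns=["input_string", "substring"],
                column_types=[ColumnType.TEXT, ColumnType.TEXT],
                column_shapes=[(None,), (None,)],
            )
        ],
        output_signatures=[
            PandasDataframe(
                columns=["contains"],
                column_types=[ColumnType.BOOLEAN],
                column_shapes=[(None,)],
            )
        ],
    )
    def forward(self, input_dataframe: pd.DataFrame) -> pd.DataFrame:
        # Row-wise substring membership check over the two input columns.
        flags = [
            substring in text
            for text, substring in zip(
                input_dataframe["input_string"], input_dataframe["substring"]
            )
        ]
        return pd.DataFrame({"contains": flags})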
diff --git a/utils/search_term.py b/utils/search_term.py
new file mode 100644
index 0000000..b372237
--- /dev/null
+++ b/utils/search_term.py
@@ -0,0 +1,114 @@
+import os
+import sys
+
+import openai
+
+# Replace 'your-api-key' with your OpenAI API key
+openai.api_key = "sk-xx"
+
+MAX_CHUNK_SIZE = 15000
+
+# Check if the search term argument is provided
+if len(sys.argv) != 2:
+    print("Usage: python search_term.py <search_term>")
+    sys.exit(1)
+
+# Extract the search term from the command line arguments
+search_term = sys.argv[1]
+
+# Define the directory where you want to search for JSON files
+search_directory = "./"
+
+# Define the output file name
+output_file = "data.txt"
+
+# Construct the find command to search for JSON files containing the specified term
+find_command = f'find "{search_directory}" -name "*.json" -exec grep -Hn --color "{search_term}" {{}} \\; > "{output_file}"'
+
+# Execute the find command
+os.system(find_command)
+
+print(f"Search results saved to {output_file}")
+
+
+# Function to split text into chunks of MAX_CHUNK_SIZE characters or less,
+# stopping at the nearest newline
+def split_text_into_chunks(text, max_chunk_size=MAX_CHUNK_SIZE):
+    chunks = []
+    current_chunk = ""
+
+    for line in text.splitlines():
+        if len(current_chunk) + len(line) + 1 <= max_chunk_size:
+            # Add line to the current chunk
+            if current_chunk:
+                current_chunk += '\n'
+            current_chunk += line
+        else:
+            # Start a new chunk
+            chunks.append(current_chunk)
+            current_chunk = line
+
+    if current_chunk:
+        chunks.append(current_chunk)
+
+    return chunks
+
+
+# Read the contents of the "data.txt" file
+with open(output_file, 'r') as file:
+    file_contents = file.read()
+
+# Split the contents into chunks
+text_chunks = split_text_into_chunks(file_contents)
+
+# Save the text chunks to a new file
+chunked_output_file = "data_chunks.txt"
+i = 0
+with open(chunked_output_file, 'w') as chunked_file:
+    for chunk in text_chunks:
+        i = i + 1
+        chunk = chunk + '\n\n\n\n\n\n\n'  # blank lines separate chunks
+        chunked_file.write(chunk)
+        print("chunk " + str(i))
+
+print(f"Text chunks saved to {chunked_output_file}")
+
+# Initialize an empty list to store responses
+responses = []
+
+
+# Create a function to generate responses using the chat model
+def generate_chat_response(prompt):
+    try:
+        response = openai.ChatCompletion.create(
+            model="gpt-3.5-turbo-16k",  # Use the appropriate chat model
+            messages=[
+                {"role": "system", "content": "You are a junior programmer trying to summarize user experience issues."},
+                {"role": "user", "content": prompt},
+            ],
+            stop=None,  # You can specify a stop condition if necessary
+            temperature=0.7,  # You can adjust the temperature for creativity
+        )
+        return response.choices[0].message['content']
+    except Exception as e:
+        return str(e)
+
+
+# Question to add to the prompt
+question = """Summarize the user complaints in these JSON messages. Along with each complaint, provide the relevant user messages and file names (e.g., questions/2022-06-03.json).
--- """ + + +# Iterate through each chunk and query ChatGPT with the question +for chunk in text_chunks: + prompt = f"{question}\n{chunk}" # Add the question to the chunk + response = generate_chat_response(prompt) + print(response) + responses.append(response) + +# Save the responses to a new file +responses_output_file = "responses.txt" +with open(responses_output_file, 'w') as responses_file: + for response in responses: + responses_file.write(response + '\n') + +print(f"Responses saved to {responses_output_file}") + From 13a9a5afdfb8178916433dd0a0d26a51f1ca4fc1 Mon Sep 17 00:00:00 2001 From: Preethi1609 Date: Tue, 3 Oct 2023 17:18:10 -0400 Subject: [PATCH 02/18] remove files --- README.md | 1 - apps/data/eva_paper.txt | 1 - apps/pandas_qa_local.py | 143 ------------------------------- utils/chunk.py | 70 --------------- utils/script.py | 185 ---------------------------------------- utils/search_term.py | 114 ------------------------- 6 files changed, 514 deletions(-) delete mode 100644 README.md delete mode 100644 apps/data/eva_paper.txt delete mode 100644 apps/pandas_qa_local.py delete mode 100644 utils/chunk.py delete mode 100644 utils/script.py delete mode 100644 utils/search_term.py diff --git a/README.md b/README.md deleted file mode 100644 index 4e55451..0000000 --- a/README.md +++ /dev/null @@ -1 +0,0 @@ -# pandas-ai-integration \ No newline at end of file diff --git a/apps/data/eva_paper.txt b/apps/data/eva_paper.txt deleted file mode 100644 index 1dee18f..0000000 --- a/apps/data/eva_paper.txt +++ /dev/null @@ -1 +0,0 @@ -EVA: An End-to-End Exploratory Video Analytics System Gaurav Tarlok Kakkar, Jiashen Cao, Pramod Chunduri, Zhuangdi Xu, Suryatej Reddy Vyalla, Prashanth Dintyala, Anirudh Prabakaran, Jaeho Bang, Aubhro Sengupta, Kaushik Ravichandran, Ishwarya Sivakumar, Aryan Rajoria, Ashmita Raju, Tushar Aggarwal, Abdullah Shah, Sanjana Garg, Shashank Suman, Myna Prasanna Kalluraya, † Subrata Mitra , Ali Payani‡ , Yao Lu★, Umakishore Ramachandran, Joy Arulraj Georgia Institute of Technology † Adobe, ‡ Cisco, ★Microsoft arulraj@gatech.edu Abstract In recent years, deep learning models have revolutionized computer vision, enabling diverse applications. However, these models are computationally expensive, and leveraging them for video analytics involves low-level imperative programming. To address these efficiency and usability challenges, the database community has developed video database management systems (VDBMSs). However, existing VDBMSs lack extensibility and composability and do not support holistic system optimizations, limiting their practical application. In response to these issues, we present our vision for EVA, a VDBMS that allows for extensible support of user-defined functions and employs a Cascades-style query optimizer. Additionally, we leverage Ray’s distributed execution to enhance scalability and performance and explore hardware-specific optimizations to facilitate runtime optimizations. We discuss the architecture and design of EVA, our achievements thus far, and our research roadmap. many libraries (e.g., PyTorch [28], OpenCV [5], and Pandas [26]) to leverage these vision models. To tackle these efficiency and usability challenges, database researchers have proposed several video database management systems (VDBMSs) [4, 9, 20, 22, 24, 34]. These systems improve usability by supporting declarative SQL-like queries over videos. 
EVAQL supports queries for loading structured data (e.g., CSVs) to populate the metadata of videos (e.g., bounding boxes of faces in a frame). Similar to traditional DBMSs, the user must explicitly define the schema before loading the CSV file:

/* Defining the schema and loading a CSV file */
CREATE TABLE IF NOT EXISTS MOVIE_METADATA (
  ID SERIAL INTEGER,
  VIDEO_ID INTEGER,
  VIDEO_FRAME_ID INTEGER,
  VIDEO_NAME TEXT(30),
  FACE_BBOXES NDARRAY FLOAT32(4)
);
LOAD CSV 'movie.csv' INTO MOVIE_METADATA;

User-Defined Functions. EVAQL is tailored for supporting user-defined functions (UDFs). UDFs allow users to extend the VDBMS to support the requirements of their applications. In EVA, UDFs are often wrappers around deep learning models. For example, a face detection UDF takes a frame as input and returns the bounding boxes of the faces detected in the frame as output. Internally, it wraps around a FaceDetection PyTorch model [35]. EVAQL supports arbitrary UDFs that take a variety of inputs (e.g., video metadata or raw frames) and generate a variety of outputs (e.g., labels, bounding boxes, video frames, etc.). The following command registers a FaceDetection UDF in EVA:

/* Registering a User-Defined Function */
CREATE UDF IF NOT EXISTS FaceDetector
TYPE FaceDetection
IMPL '/udfs/face_detector.py'
PROPERTIES ('ACCURACY'='HIGH');

TYPE specifies the logical model type of the UDF (e.g., FaceDetection or ObjectDetection). IMPL specifies the path to the Python file containing the implementation of the UDF. Internally, EVA uses importlib for creating and importing UDF objects from the file [14]. The user can specify other metadata like the accuracy in PROPERTIES. EVA uses these properties to accelerate queries. For example, if the overall query accuracy requirement is moderate (e.g., 0.8× the oracle model), EVA uses faster (but less accurate) models of the same model type to accelerate the query. After registering the UDF, it can be executed on a video as shown in Listing 1.

Interfaces. EVA currently supports EVAQL queries from both a command line interface and Jupyter notebooks. We seek to support a Pythonic dataframe API in the future.

2.2 Query Optimizer

EVA’s Optimizer is based on the Cascades framework [17]. It applies a series of rules for rewriting the query and then performs cost-based optimization to generate a physical query plan. The Optimizer in a VDBMS differs from that in a relational DBMS in two ways. First, it must focus on minimizing query processing time while meeting the accuracy constraint (which often does not exist in a typical relational DBMS). Second, it is expensive to derive statistics from videos a priori, as that involves running expensive deep learning models. So, while processing an ad-hoc query, the Optimizer runs vision models on a subset of frames to guide important optimization decisions (e.g., whether the query plan will meet the accuracy constraint, or how the predicates invoking vision models should be ordered [22, 31, 40]).
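To make the ordering decision concrete, here is a toy sketch of the classic rank-based heuristic (run cheap, highly selective predicates first). This is an illustration, not EVA's actual optimizer code; the costs and selectivities are made-up values of the kind obtained by profiling models on a sample of frames:

from dataclasses import dataclass

@dataclass
class Predicate:
    name: str
    cost: float         # seconds per frame (profiled)
    selectivity: float  # fraction of frames that pass (estimated on a sample)

def order_predicates(preds):
    # Run cheap, selective predicates first, so expensive models
    # are invoked on as few frames as possible.
    return sorted(preds, key=lambda p: p.cost / max(1.0 - p.selectivity, 1e-9))

preds = [Predicate("FaceDetector", 0.050, 0.30),
         Predicate("proxy_person_filter", 0.002, 0.40)]
print([p.name for p in order_predicates(preds)])  # proxy filter runs first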
Figure 2: Illustrative UDF Optimization Rules – (a) a UDF transformation rule that extracts the UDF from the predicate and converts it to an APPLY operator, (b) a UDF filtering rule that introduces a proxy UDF model for quickly filtering out irrelevant frames before executing the UDF, and (c) a UDF reordering rule that reorders UDFs based on their inference cost and the availability of materialized results from prior queries.

2.3 Execution Engine

The Execution Engine is responsible for evaluating the query plan generated by the Optimizer. While executing the plan, it leverages heterogeneous computational units (e.g., CPUs and GPUs). EVA leverages DL frameworks like PyTorch [28] for model inference. In an earlier prototype of EVA [40], the Execution Engine did not support distributed query execution. We have recently added support for distributed query execution (§ 3.3) using Ray [25].

2.4 Storage Engine

Lastly, the Storage Engine is responsible for managing the videos. In an earlier prototype of EVA [40], the Storage Engine organized the videos as a sequence of decoded frames, similar to SoTA VDBMSs [20]. However, this approach not only significantly increases the storage footprint of EVA on larger datasets but also does not provide any significant reduction in query execution time. We have subsequently redesigned the Storage Engine to manage videos in a compressed format. The Storage Engine manages structured data (e.g., bounding boxes of faces) on disk using the Parquet format [1]. It uses Arrow [30] as the in-memory columnar format for data that is read from or written to on-disk Parquet files.
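As an illustration of this structured-data path, the sketch below writes hypothetical bounding-box metadata to Parquet and reads it back through Arrow. It assumes the pyarrow package, and the schema is invented for the example rather than being EVA's internal one:

import pyarrow as pa
import pyarrow.parquet as pq

# Hypothetical per-frame face metadata as an Arrow table.
table = pa.table({
    "video_frame_id": pa.array([0, 1], type=pa.int32()),
    "face_bboxes": pa.array([[0.1, 0.2, 0.4, 0.5], [0.3, 0.3, 0.6, 0.7]],
                            type=pa.list_(pa.float32())),
})
pq.write_table(table, "face_metadata.parquet")  # on-disk columnar format
faces = pq.read_table("face_metadata.parquet")  # read back as an in-memory Arrow table
print(faces)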
3 PROGRESS

We are implementing EVA as a Python package with an Apache License, based on a client-server architecture [2]. We have made progress on enhancing the extensibility of EVA, and the efficacy of the Optimizer and the Execution Engine.

3.1 Extensibility - Importing UDFs

EVA allows users to import their own UDFs in two ways. Users can either import their own implemented UDFs (i.e., from source) or import them from popular third-party platforms (e.g., HuggingFace [39], PyTorch).

UDF from Source. EVA supports defining UDFs using function decorators in Python. This allows users to migrate their existing deep learning models to EVA with a few lines of Python code. Users define the input and output formats of their models and configuration options through the decorator-based syntax. In the following listing, the @setup decorator specifies the configuration options for the UDF. The user specifies the properties: whether EVA can cache the results of the UDF, whether the UDF supports batch-mode execution, etc. The @forward decorator specifies the input and output types/dimensions for the UDF.

# Configuring a UDF with decorators
class ImageClassificationUDF:
    @setup(cachable=True, batchable=True,
           udf_type="ImageClassification")
    def setup(self):
        # prepare the UDF
        ...

    @forward(
        input_signatures=[PyTorchTensor(type=NdArrayType.FLOAT32,
                                        dimensions=(1, 3, 540, 960))],
        output_signatures=[PandasDataframe(columns=["label"],
                                           column_types=[NdArrayType.STR])])
    def forward(self):
        # do inference
        ...

UDF from HuggingFace. Recently, HuggingFace [39] has gained popularity in the deep learning community for its support of various models across multiple data modalities (e.g., text, audio, video, etc.). EVA supports HuggingFace tasks and models out of the box. Users define tasks or specify models using EVA’s declarative language:

/* Registering an ObjectDetector model */
CREATE UDF MyObjectDetector
TYPE HuggingFace
PROPERTIES ('task'='object-detection',
            'model'='facebook/detr-resnet-50');

Here, the user adds a UDF that performs object detection using the model facebook/detr-resnet-50.

3.2 Query Optimizer - Reuse of Inference Results

UDFs are often the most expensive operators in VDBMS queries. To accelerate such queries, EVA materializes the results of UDFs and reuses them while processing subsequent queries in exploratory video analytics [40]. Reusing the results of UDFs in VDBMSs differs from the query plan matching algorithms in traditional DBMSs [3] that focus on expensive join operators. In contrast, in VDBMSs, UDFs frequently occur in predicates and projection lists. EVA’s optimizer supports novel general-purpose rewrite rules that are not present in SoTA VDBMSs. For example, to identify reuse opportunities, the Optimizer uses a UDF-centric rewrite rule (Fig. 2 (a)) that extracts the UDF from the predicate/projection expression and rewrites it using the CROSS APPLY operator [15]. The resulting query plan makes it feasible to explore rules like: (1) materializing and reusing the results of UDFs [40], (2) adding derived models (Fig. 2 (b)) [20, 21], (3) UDF reordering (Fig. 2 (c)), (4) UDF de-duplication, and (5) introducing a video sampling operator before the UDF. Here, UDF de-duplication refers to avoiding redundant computation of a UDF that occurs multiple times in a single query. For example, if both UDFs in the left-hand-side query tree in Fig. 2 (c) are identical, we merge them into a single apply operator.
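The reuse idea can be illustrated with a toy memoization sketch. In EVA the materialization is managed by the Optimizer and Storage Engine rather than an in-process dictionary, and the names below are hypothetical:

# Toy sketch of materializing and reusing UDF results across queries.
inference_cache = {}

def cached_udf(udf_name, udf_fn, frame_id, frame):
    key = (udf_name, frame_id)       # one entry per (model, frame) pair
    if key not in inference_cache:   # the first query pays for inference
        inference_cache[key] = udf_fn(frame)
    return inference_cache[key]      # subsequent queries reuse the result

# The second call returns the cached result without rerunning the model.
detect = lambda frame: ["bbox-placeholder"]  # stand-in for a real detector
print(cached_udf("FaceDetector", detect, 0, "frame-0"))
print(cached_udf("FaceDetector", detect, 0, "frame-0"))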
3.3 Execution Engine - Integrating Ray

Our primary objective in integrating Ray into EVA is to support distributed query execution. We seek to initially support intra-query parallelism [18]. Consider a query that involves running the FaceDetection UDF on a movie video with 13K frames using a server with two GPUs. With a single GPU, it takes 402 s to process the query. Using Ray, EVA automatically splits the video into two partitions and uses both GPUs for model inference, reducing the query processing time to 209 s. Besides data-level parallelism, EVA also supports parallel processing of complex query predicates. For example, to evaluate “UDF1(a) < 10 AND UDF2(b) > 20”, the VDBMS may either evaluate the two atomic predicates in parallel, or perform canonical predicate reordering and short-circuit the predicate evaluation.

Figure 3: Illustration of Exchange Operator — This query retrieves the emotions of all the faces detected in the video.

Exchange Operator. The Optimizer uses the exchange operator [6] to encapsulate the degree of parallelism (dop) in the query plan. The exchange operator splits the plan into two stages and configures the parallelism of the lower stage. Consider the query plan shown in Fig. 3. First, as specified by the lower exchange operator, two processes run the FaceDetection UDF on the video. Then, the upper exchange operator indicates that a single process should run the EmotionClassification UDF on the bounding boxes of the detected faces. To leverage Ray, the Optimizer in EVA transforms the query plan into Ray actors and chains them via Ray queues.

4 ROADMAP

We next describe our ongoing work and open questions in implementing EVA. We seek to continue improving the usability of EVA, and also the efficacy of the Optimizer and the Execution Engine.

4.1 Extensibility - Enhancing Querying Capability

Action Queries. In our prior work on Zeus [8], we emphasized the need to improve the querying capabilities of VDBMSs to encompass action queries. Zeus assumes the availability of a vision model explicitly trained for the target action (e.g., a person riding a motorcycle). However, in real-world applications the action may rarely occur in the dataset, leading to insufficient true positive examples (i.e., class imbalance) during training. In addition, the number of ad-hoc combinations of objects and their interactions that form the actions is exponential. To overcome these challenges, we seek to pursue a more practical approach in EVA. We are investigating techniques to break ad-hoc actions into a collection of spatio-temporal predicates over the bounding boxes and the trajectories of objects across a sequence of frames [10, 33].

Similarity Search. To meet the needs of real-world applications [38], we seek to support object re-identification and similarity search queries in EVA. Consider a query that retrieves all the frames in a movie that contain a target actor. Efficiently searching for the specific actor using a target image requires the use of computationally expensive object re-identification models. We are currently investigating the integration of incremental search techniques into EVA’s Optimizer to accelerate re-identification queries.

4.2 Query Optimizer - Accuracy-Guided Optimization

As in relational DBMSs, the VDBMS’s Optimizer estimates the query plan’s cost by profiling the cost of the operators and estimating the selectivity of predicates. However, there are two key differences. First, deep learning models are not always accurate. So, unlike relational DBMSs, VDBMSs cannot guarantee accurate results. This gives the Optimizer an opportunity to jointly optimize the query plan for both runtime performance and accuracy constraints. Second, the Optimizer must not treat a UDF as a black box. Instead, it should exploit the semantic properties of UDFs. For example, the Optimizer in EVA has the flexibility to pick a suitable physical model for processing a logical vision task, as long as it meets the query’s accuracy constraint. In our prior work [7], we showed how the Optimizer may dynamically pick different models for processing video chunks of varying complexity. We are investigating how to extend the Cascades-style Optimizer in EVA to jointly optimize for query execution cost and query accuracy. We seek to support complex model pipelines – proxy models, model cascades, and model ensembles.

4.3 Execution Engine - GPU-aware Optimization

Resource utilization. As EVA extensively uses GPUs for query processing, it is critical to optimize query execution on GPUs. The Optimizer needs to insert the exchange operator and tune the degree-of-parallelism (DOP) parameter. The optimal DOP value depends on the model execution cost, the overall query, and the underlying data. We are investigating how to optimize this critical parameter to better leverage GPUs. Concretely, given the number of GPUs and their computational capabilities, EVA must decide where to inject the exchange operators in the query plan and what the suitable degree of parallelism is for each operator. To achieve this, the Optimizer first generates a statically optimized plan. Later, it leverages the adaptive Execution Engine, adjusting the pipeline dynamically during execution to reduce overall processing time.
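A minimal sketch of the partition-and-fan-out pattern that the exchange operator encapsulates, assuming the ray package; detect_faces is a placeholder stub for a real detector UDF, and real deployments would tune dop as discussed above:

import ray

def detect_faces(frame):
    return []  # placeholder for real model inference

@ray.remote(num_gpus=1)  # requires GPUs; set num_gpus=0 to try it on CPU
def run_partition(frames):
    # Each worker pins one GPU and processes its partition of the video.
    return [detect_faces(f) for f in frames]

def parallel_inference(frames, dop=2):
    # Split the frames into `dop` partitions, mirroring the lower exchange stage.
    chunk = (len(frames) + dop - 1) // dop
    parts = [frames[i:i + chunk] for i in range(0, len(frames), chunk)]
    return ray.get([run_partition.remote(p) for p in parts])

if __name__ == "__main__":
    ray.init(ignore_reinit_error=True)
    print(parallel_inference(list(range(8)), dop=2))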
Minimize data transfer cost. In queries with multiple UDFs, the same input frames may be transferred from the CPU to the GPU multiple times during query execution. Moreover, EVA only has CPU implementations of certain operators like join, predicate filtering, and cropping, which forces data transfers between the CPU and GPU across operators (e.g., 10 GB of additional data movement for the query shown in Listing 1). To minimize this cost, we seek to investigate two optimizations: (1) lazy eviction and (2) operator fusion. First, with lazy eviction, the Execution Engine caches frames on the GPU if they are required by later operators in the query pipeline, as sketched below. Second, with operator fusion, we plan to add GPU-centric implementations of general-purpose operators (e.g., join and image cropping) to reduce data movement overhead.
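Lazy eviction can be sketched as a small reference-counted GPU cache. This is an illustration assuming PyTorch, with consumer counts supplied by a hypothetical planner; it is not EVA's actual implementation:

import torch

class GpuFrameCache:
    def __init__(self):
        self._cache = {}  # frame_id -> [tensor on device, remaining consumers]

    def get(self, frame_id, cpu_tensor, consumers=1):
        if frame_id not in self._cache:
            device = "cuda" if torch.cuda.is_available() else "cpu"
            # Upload once; later operators reuse the resident tensor.
            self._cache[frame_id] = [cpu_tensor.to(device), consumers]
        return self._cache[frame_id][0]

    def release(self, frame_id):
        entry = self._cache.get(frame_id)
        if entry is not None:
            entry[1] -= 1
            if entry[1] <= 0:  # evict only after the last consumer finishes
                del self._cache[frame_id]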
5 CONCLUSION

In this paper, we present our vision, current progress, and roadmap for future improvements to EVA, focusing on querying capability, query optimization, and query execution. We hope that EVA will enable a broader set of application developers to leverage recent advances in vision for analyzing unstructured data.

References
[1] Apache Parquet. https://parquet.apache.org/.
[2] EVA Video Database System. https://pypi.org/project/evadb/.
[3] A. Jindal, K. Karanasos, S. Rao, and H. Patel. Selecting Subexpressions to Materialize at Datacenter Scale. In VLDB, 2018.
[4] F. Bastani, S. He, A. Balasingam, K. Gopalakrishnan, M. Alizadeh, H. Balakrishnan, M. Cafarella, T. Kraska, and S. Madden. MIRIS: Fast Object Track Queries in Video. In SIGMOD, pages 1907–1921, 2020.
[5] G. Bradski. The OpenCV Library. Dr. Dobb’s Journal of Software Tools, 2000.
[6] E. Brewer. Volcano & the Exchange Operator, 2022.
[7] J. Cao, K. Sarkar, R. Hadidi, J. Arulraj, and H. Kim. FiGO: Fine-Grained Query Optimization in Video Analytics. In SIGMOD, pages 559–572, 2022.
[8] P. Chunduri, J. Bang, Y. Lu, and J. Arulraj. Zeus: Efficiently Localizing Actions in Videos Using Reinforcement Learning. In SIGMOD, pages 545–558, 2022.
[9] M. Daum, B. Haynes, D. He, A. Mazumdar, M. Balazinska, and A. Cheung. TASM: A Tile-Based Storage Manager for Video Analytics. arXiv, abs/2006.02958, 2020.
[10] M. Daum, E. Zhang, D. He, M. Balazinska, B. Haynes, R. Krishna, A. Craig, and A. Wirsing. VOCAL: Video Organization and Interactive Compositional AnaLytics. In CIDR, 2022.
[11] J. Dean, D. Patterson, and C. Young. A New Golden Age in Computer Architecture: Empowering the Machine-Learning Revolution. IEEE Micro, 38(2):21–29, 2018.
[12] J. Dellinger, C. Shores, A. Craig, S. Kachel, M. Heithaus, W. Ripple, and A. Wirsing. Predators reduce niche overlap between sympatric prey. Oikos, 2021.
[13] A. Deshpande, Z. Ives, V. Raman, et al. Adaptive query processing. Foundations and Trends in Databases, 1(1):1–140, 2007.
[14] Python Software Foundation. importlib - The implementation of import, 2022.
[15] C. Galindo-Legaria and M. Joshi. Orthogonal optimization of subqueries and aggregation. In SIGMOD, 2001.
[16] A. Gandhi, Y. Asada, V. Fu, A. Gemawat, L. Zhang, R. Sen, C. Curino, J. Camacho-Rodríguez, and M. Interlandi. The Tensor Data Platform: Towards an AI-centric Database System. In CIDR, 2023.
[17] G. Graefe. The Cascades Framework for Query Optimization. IEEE Data Eng. Bull., 18(3):19–29, 1995.
[18] N. Hardavellas and I. Pandis. Intra-Query Parallelism, pages 1567–1568. Springer US, Boston, MA, 2009.
[19] M. Heithaus, L. Dill, G. Marshall, and B. Buhleier. Habitat use and foraging behavior of tiger sharks (Galeocerdo cuvier) in a seagrass ecosystem. Marine Biology, 140(2):237–248, 2002.
[20] D. Kang, P. Bailis, and M. Zaharia. BlazeIt: Optimizing Declarative Aggregation and Limit Queries for Neural Network-Based Video Analytics. Proc. VLDB Endow., 13:533–546, 2019.
[21] D. Kang, J. Emmons, F. Abuzaid, P. Bailis, and M. Zaharia. NoScope: Optimizing Neural Network Queries over Video at Scale. Proc. VLDB Endow., 10(11):1586–1597, 2017.
[22] D. Kang, F. Romero, P. D. Bailis, C. Kozyrakis, and M. Zaharia. VIVA: An End-to-End System for Interactive Video Analytics. In CIDR, 2022.
[23] G. Liu, H. Shi, A. Kiani, A. Khreishah, J. Lee, N. Ansari, C. Liu, and M. M. Yousef. Smart Traffic Monitoring System using Computer Vision and Edge Computing. IEEE Transactions on Intelligent Transportation Systems, 2021.
[24] Y. Lu, A. Chowdhery, S. Kandula, and S. Chaudhuri. Accelerating Machine Learning Inference with Probabilistic Predicates. In SIGMOD, 2018.
[25] P. Moritz, R. Nishihara, S. Wang, A. Tumanov, R. Liaw, E. Liang, M. Elibol, Z. Yang, W. Paul, M. I. Jordan, and I. Stoica. Ray: A Distributed Framework for Emerging AI Applications. In OSDI, pages 561–577, 2018.
[26] Pandas. pandas-dev/pandas: Pandas, Feb. 2020.
[27] K. Park, K. Saur, D. Banda, R. Sen, M. Interlandi, and K. Karanasos. End-to-end Optimization of Machine Learning Prediction Queries. In SIGMOD, pages 587–601, 2022.
[28] A. Paszke, S. Gross, F. Massa, A. Lerer, J. Bradbury, G. Chanan, T. Killeen, Z. Lin, N. Gimelshein, L. Antiga, A. Desmaison, A. Köpf, E. Yang, Z. DeVito, M. Raison, A. Tejani, S. Chilamkurthy, B. Steiner, L. Fang, J. Bai, and S. Chintala. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In NeurIPS, 2019.
[29] A. Rheinländer, U. Leser, and G. Graefe. Optimization of Complex Dataflows with User-Defined Functions. ACM Computing Surveys, 50(3):1–39, 2017.
[30] N. Richardson, I. Cook, N. Crane, D. Dunnington, R. François, J. Keane, D. Moldovan-Grünfeld, J. Ooms, and Apache Arrow. arrow: Integration to Apache Arrow, 2022. https://arrow.apache.org/docs/r/.
[31] F. Romero, J. Hauswald, A. Partap, D. Kang, M. Zaharia, and C. Kozyrakis. Optimizing Video Analytics with Declarative Model Relationships. Proc. VLDB Endow., 16(3):447–460, 2022.
[32] O. Russakovsky, J. Deng, H. Su, J. Krause, S. Satheesh, S. Ma, Z. Huang, A. Karpathy, A. Khosla, and M. Bernstein. ImageNet Large Scale Visual Recognition Challenge. IJCV, 115(3):211–252, 2015.
[33] M. A. Sakr and R. H. Güting. Spatiotemporal pattern queries. GeoInformatica, 15(3):497–540, 2011.
[34] M. Satyanarayanan, P. B. Gibbons, L. B. Mummert, P. Pillai, P. Simoens, and R. Sukthankar. Cloudlet-based just-in-time indexing of IoT video.
In Global Internet of Things Summit (GIoTS), pages 1–8. IEEE, 2017.
[35] F. Schroff, D. Kalenichenko, and J. Philbin. FaceNet: A Unified Embedding for Face Recognition and Clustering. In CVPR, pages 815–823, 2015.
[36] A. W. Senior, L. M. Brown, A. Hampapur, C. Shu, Y. Zhai, R. S. Feris, Y. Tian, S. Borger, and C. R. Carlson. Video Analytics for Retail. In AVSS, pages 423–428. IEEE Computer Society, 2007.
[37] Z. Shou, D. Wang, and S.-F. Chang. Temporal Action Localization in Untrimmed Videos via Multi-Stage CNNs. In CVPR, pages 1049–1058, 2016.
[38] T. Skopal, F. Falchi, J. Lokoc, M. L. Sapino, I. Bartolini, and M. Patella, editors. Similarity Search and Applications (SISAP 2022), volume 13590 of Lecture Notes in Computer Science. Springer, 2022.
[39] T. Wolf, L. Debut, V. Sanh, J. Chaumond, C. Delangue, A. Moi, P. Cistac, T. Rault, R. Louf, M. Funtowicz, J. Davison, S. Shleifer, P. von Platen, C. Ma, Y. Jernite, J. Plu, C. Xu, T. L. Scao, S. Gugger, M. Drame, Q. Lhoest, and A. M. Rush. Transformers: State-of-the-Art Natural Language Processing. In EMNLP: System Demonstrations, pages 38–45, 2020.
[40] Z. Xu, G. T. Kakkar, J. Arulraj, and U. Ramachandran. EVA: A Symbolic Approach to Accelerating Exploratory Video Analytics with Materialized Views. In SIGMOD, pages 602–616, 2022.
[41] S. Yang, E. Bailey, Z. Yang, J. Ostrometzky, G. Zussman, I. Seskar, and Z. Kostic. COSMOS Smart Intersection: Edge Compute and Communications for Bird’s Eye Object Tracking. In PerCom, pages 1–7. IEEE, 2020.
\ No newline at end of file diff --git a/apps/pandas_qa_local.py b/apps/pandas_qa_local.py deleted file mode 100644 index 2859f86..0000000 --- a/apps/pandas_qa_local.py +++ /dev/null @@ -1,143 +0,0 @@ -# coding=utf-8 -# Copyright 2018-2023 EvaDB -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os - -from gpt4all import GPT4All -import shutil -import subprocess -from typing import Dict - - -import pandas as pd - -APP_SOURCE_DIR = os.path.abspath(os.path.dirname(__file__)) -CURRENT_WORKING_DIR = os.getcwd() # used to locate evadb_data dir - -# default file paths -DEFAULT_TEXT_FILE_PATH = os.path.join(APP_SOURCE_DIR, "data", "eva_paper.txt") -MAX_CHUNK_SIZE = 2000 -def receive_user_input() -> Dict: - """Receives user input. - - Returns: - user_input (dict): global configurations - """ - print( - "🔮 Welcome to EvaDB!
This app lets you to run data analytics on a csv file like in a conversational manner.\nYou will only need to supply a path to csv file and an OpenAI API key.\n\n" - ) - user_input = dict() - - text_file_path = str( - input("📋 Enter the text file path (press Enter to use our default text file): ") - ) - - if text_file_path == "": - text_file_path = DEFAULT_TEXT_FILE_PATH - user_input["text_file_path"] = text_file_path - - return user_input - -def generate_script(df: pd.DataFrame, question: str) -> str: - """Generates script with llm. - - Args: - question (str): question to ask to llm. - - Returns - str: script generated by llm. - """ - # generate summary - all_columns = list(df) # Creates list of all column headers - df[all_columns] = df[all_columns].astype(str) - - prompt = f"""There is a dataframe in pandas (python). The name of the - dataframe is df. This is the result of print(df.head()): - {str(df.head())}\nAssuming the dataframe is already loaded and named 'df'. Do not include pd.read_csv, do not write code to load the CSV file. Return a python script to get the answer to a question. - Question : {question}. """ - - llm = GPT4All("llama-2-7b-chat.ggmlv3.q4_0.bin") - - script_body = llm.generate(prompt) - script_body = script_body.split("```")[1].lstrip("python") - return script_body - - - -def cleanup(): - """Removes any temporary file / directory created by EvaDB.""" - if os.path.exists("evadb_data"): - shutil.rmtree("evadb_data") - -def split_text_into_chunkss(text, max_chunk_size=MAX_CHUNK_SIZE): - chunks = [] - current_chunk = "" - - for line in text.splitlines(): - if len(current_chunk) + len(line) + 1 <= max_chunk_size: - # Add line to the current chunk - if current_chunk: - current_chunk += '\n' - current_chunk += line - else: - # Start a new chunk - chunks.append(current_chunk) - current_chunk = line - - if current_chunk: - chunks.append(current_chunk) - - return chunks - -def split_text_into_chunks(text, max_chunk_size=MAX_CHUNK_SIZE): - chunks = [] - start = 0 - end = max_chunk_size - - while start < len(text): - chunk = text[start:end] - chunks.append(chunk) - start = end - end += max_chunk_size - - return chunks - -if __name__ == "__main__": - # try: - # receive input from user - user_input = receive_user_input() - df = pd.read_csv(user_input["text_file_path"], names=['text']) - with open("/home/preethi/projects/pandas-ai-integration/apps/data/eva_paper.txt", 'r') as file: - file_contents = file.read() - - # Split the contents into chunks - text_chunks = split_text_into_chunks(file_contents) - chunked_output_file = "data_chunks.txt" - i = 0 - with open(chunked_output_file, 'w') as chunked_file: - for chunk in text_chunks: - i = i + 1 - chunk = chunk + '\n\n\n\n\n\n\n' - chunked_file.write(chunk) - print("chunk " + str(i)) - - print(f"Text chunks saved to {chunked_output_file}") - print("here1") - llm = GPT4All("llama-2-7b-chat.ggmlv3.q4_0.bin") - summaries = [] - for chunk in text_chunks: - summaries.append(llm.generate("Summarize this text" + chunk)) - print("SUMMARRYYYYYY", summaries) - \ No newline at end of file diff --git a/utils/chunk.py b/utils/chunk.py deleted file mode 100644 index 5541fb6..0000000 --- a/utils/chunk.py +++ /dev/null @@ -1,70 +0,0 @@ -import pandas as pd -from evadb.catalog.catalog_type import ColumnType -from evadb.functions.abstract.abstract_function import AbstractFunction -from evadb.functions.decorators.decorators import forward, setup -from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe - -class 
Chunk(AbstractFunction): - """ - Arguments: - None - - Input Signatures: - input_dataframe (DataFrame) : A DataFrame containing a column of strings. - - Output Signatures: - output_dataframe (DataFrame) : A DataFrame containing chunks of strings. - - Example Usage: - You can use this function to concatenate strings in a DataFrame and split them into chunks. - """ - - @property - def name(self) -> str: - return "Chunk" - - @setup(cacheable=False) - def setup(self) -> None: - # Any setup or initialization can be done here if needed - pass - - @forward( - input_signatures=[ - PandasDataframe( - columns=["text"], - column_types=[ColumnType.TEXT], - column_shapes=[(None,)], - ) - ], - output_signatures=[ - PandasDataframe( - columns=["chunks"], - column_types=[ColumnType.TEXT], - column_shapes=[(None,)], - ) - ], - ) - def forward(self, input_dataframe): - # Ensure input is provided - if input_dataframe.empty: - raise ValueError("Input DataFrame must not be empty.") - - # Define the maximum number of tokens per chunk - max_tokens_per_chunk = 100 # Adjust this value as needed - - # Initialize lists for the output DataFrame - output_strings = [] - - # Iterate over rows of the input DataFrame - for _, row in input_dataframe.iterrows(): - input_string = row["text"] - - # Split the input string into chunks of maximum tokens - chunks = [input_string[i:i + max_tokens_per_chunk] for i in range(0, len(input_string), max_tokens_per_chunk)] - - output_strings.extend(chunks) - - # Create a DataFrame with the output strings - output_dataframe = pd.DataFrame({"chunks": output_strings}) - - return output_dataframe \ No newline at end of file diff --git a/utils/script.py b/utils/script.py deleted file mode 100644 index 604c562..0000000 --- a/utils/script.py +++ /dev/null @@ -1,185 +0,0 @@ -import json -import csv -import os -import evadb -import pandas as pd - -# Specify the directory containing your JSON files and the desired CSV file name -JSON_DIRECTORY = "./atlanta" -PROJECT_NAME = "postgres" - -CSV_FILE_PATH = f'{PROJECT_NAME}.csv' - -# Initialize an empty list to store the combined data from all JSON files -combined_data = [] - -# Iterate through each JSON file in the directory -for filename in os.listdir(JSON_DIRECTORY): - if filename.endswith('.json'): - json_file_path = os.path.join(JSON_DIRECTORY, filename) - - # Open the JSON file for reading - with open(json_file_path, 'r', encoding='utf-8') as json_input_file: - # Load the JSON data from the file - json_data = json.load(json_input_file) - for json_obj in json_data: - json_obj['date'] =\ - os.path.basename(str(json_file_path)) - - # Append the JSON data to the combined_data list - combined_data.extend(json_data) - -# Specify the headers for your CSV file based on the keys present in the JSON data -# This will ensure that only common keys across all JSON objects are included -csv_headers = list(set().union(*(d.keys() for d in combined_data))) - -# Open the CSV file for writing -with open(CSV_FILE_PATH, 'w', newline='', encoding='utf-8') as csv_output_file: - # Create a CSV writer - csv_writer = csv.DictWriter(csv_output_file, fieldnames=csv_headers) - - # Write the headers to the CSV file - csv_writer.writeheader() - - # Write the combined JSON data to the CSV file - csv_writer.writerows(combined_data) - -print(f'Conversion from JSON to CSV complete. 
Data saved to {CSV_FILE_PATH}') - -# Specify the input CSV file and output CSV file -input_csv_file = CSV_FILE_PATH -output_csv_file = CSV_FILE_PATH - -# Define the old and new column names -old_column_name = 'metadata' -new_column_name = 'metadata_slack' - -# Read the input CSV file and create a list of rows -with open(input_csv_file, 'r', newline='', encoding='utf-8') as input_file: - # Create a CSV reader - csv_reader = csv.reader(input_file) - - # Read the header row - header = next(csv_reader) - - # Find the index of the old column name in the header - try: - old_index = header.index(old_column_name) - except ValueError: - # Handle the case where the old column name is not found in the header - print(f'Column name "{old_column_name}" not found in the header.') - exit(1) - - # Update the header with the new column name - header[old_index] = new_column_name - - # Read the rest of the rows - rows = list(csv_reader) - -# Write the modified CSV data to the output file -with open(output_csv_file, 'w', newline='', encoding='utf-8') as output_file: - # Create a CSV writer - csv_writer = csv.writer(output_file) - - # Write the updated header - csv_writer.writerow(header) - - # Write the rest of the rows - csv_writer.writerows(rows) - -print(f'Column name "{old_column_name}" has been changed to "{new_column_name}" in {output_csv_file}') - -if __name__ == "__main__": - try: - # establish evadb api cursor - print("⏳ Establishing evadb connection...") - cursor = evadb.connect().cursor() - print("✅ evadb connection setup complete!") - - print(f'{CSV_FILE_PATH}') - - cursor.query(f"DROP FUNCTION IF EXISTS Chunk;").df() - - cursor.query(f""" - CREATE FUNCTION Chunk - INPUT (text TEXT(1000)) - OUTPUT (chunks TEXT(1000)) - TYPE StringProcessing - IMPL 'chunk.py'; - """).df() - - cursor.query(f"DROP FUNCTION IF EXISTS Contains;").df() - - cursor.query(f""" - CREATE FUNCTION Contains - INPUT (input_string TEXT(1000), substring TEXT(1000)) - OUTPUT (contains BOOLEAN) - TYPE StringProcessing - IMPL 'contains.py'; - """).df() - - cursor.query(f"DROP TABLE IF EXISTS SlackCSV;").df() - - cursor.query(f"""CREATE TABLE SlackCSV( - blocks TEXT(1000), - user_profile TEXT(1000), - reply_count TEXT(1000), - edited TEXT(1000), - user TEXT(1000), - username TEXT(1000), - bot_id INTEGER, - text TEXT(1000), - user_team TEXT(1000), - replies TEXT(1000), - icons TEXT(1000), - hidden TEXT(1000), - delete_original TEXT(1000), - pinned_to TEXT(1000), - latest_reply TEXT(1000), - old_name TEXT(1000), - team TEXT(1000), - reply_users TEXT(1000), - metadata_slack TEXT(1000), - replace_original TEXT(1000), - subscribed TEXT(1000), - reply_users_count TEXT(1000), - parent_user_id TEXT(1000), - thread_ts TEXT(1000), - attachments TEXT(1000), - subtype TEXT(1000), - last_read TEXT(1000), - client_msg_id TEXT(1000), - bot_profile TEXT(1000), - reactions TEXT(1000), - files TEXT(1000), - name TEXT(1000), - inviter TEXT(1000), - upload TEXT(1000), - type TEXT(1000), - ts TEXT(1000), - purpose TEXT(1000), - source_team TEXT(1000), - date TEXT(1000) - ); - """).df() - - cursor.query(f"LOAD CSV '{CSV_FILE_PATH}' INTO SlackCSV;").df() - - pd.set_option('display.max_columns', None) # Show all columns - pd.set_option('display.expand_frame_repr', False) - pd.set_option('display.max_colwidth', None) - print("here1") - # execute a select query - select_query = cursor.query( - """SELECT Chunk(text) - FROM SlackCSV - WHERE _row_id < 100 AND Contains(text, "predict") = "True"; - """).df() - print("here2") - print(select_query) - - except 
Exception as e: - print("❗️ Session ended with an error.") - print(e) - -exit(0) diff --git a/utils/search_term.py b/utils/search_term.py deleted file mode 100644 index b372237..0000000 --- a/utils/search_term.py +++ /dev/null @@ -1,114 +0,0 @@ -import os -import subprocess -import sys -import openai -import evadb - -# Replace 'your-api-key' with your OpenAI API key -openai.api_key = "sk-xx" - -MAX_CHUNK_SIZE=15000 - -# Check if the search term argument is provided -if len(sys.argv) != 2: - print("Usage: python script.py ") - sys.exit(1) - - -# Extract the search term from the command line arguments -search_term = sys.argv[1] - -# Define the directory where you want to search for JSON files -search_directory = "./" - -# Define the output file name -output_file = "data.txt" - -# Construct the find command to search for JSON files containing the specified term -find_command = f'find "{search_directory}" -name "*.json" -exec grep -Hn --color "{search_term}" {{}} \\; > "{output_file}"' - -# Execute the find command -os.system(find_command) - -print(f"Search results saved to {output_file}") - -# Function to split text into chunks of MAX_CHUNK_SIZE characters or less, stopping at the nearest newline -def split_text_into_chunks(text, max_chunk_size=MAX_CHUNK_SIZE): - chunks = [] - current_chunk = "" - - for line in text.splitlines(): - if len(current_chunk) + len(line) + 1 <= max_chunk_size: - # Add line to the current chunk - if current_chunk: - current_chunk += '\n' - current_chunk += line - else: - # Start a new chunk - chunks.append(current_chunk) - current_chunk = line - - if current_chunk: - chunks.append(current_chunk) - - return chunks - -# Read the contents of the "data.txt" file -with open(output_file, 'r') as file: - file_contents = file.read() - -# Split the contents into chunks -text_chunks = split_text_into_chunks(file_contents) - -# Save the text chunks to a new file -chunked_output_file = "data_chunks.txt" -i = 0 -with open(chunked_output_file, 'w') as chunked_file: - for chunk in text_chunks: - i = i + 1 - chunk = chunk + '\n\n\n\n\n\n\n' - chunked_file.write(chunk) - print("chunk " + str(i)) - -print(f"Text chunks saved to {chunked_output_file}") - -# Initialize an empty list to store responses -responses = [] - -# Create a function to generate responses using the chat model -def generate_chat_response(prompt): - try: - print("done") - return "tmp" - response = openai.ChatCompletion.create( - model="gpt-3.5-turbo-16k", # Use the appropriate chat model - messages=[ - {"role": "system", "content": "You are a junior programmer trying to summarize user experience issues."}, - {"role": "user", "content": prompt}, - ], - stop=None, # You can specify a stop condition if necessary - temperature=0.7, # You can adjust the temperature for creativity - ) - return response.choices[0].message['content'] - except Exception as e: - return str(e) - -# Question to add to the prompt -question = """Summarize the user complaints in these JSON messages. Along with each complaint, provide the relevant user messages and file names (e.g., questions/2022-06-03.json). 
--- """ - - -# Iterate through each chunk and query ChatGPT with the question -for chunk in text_chunks: - prompt = f"{question}\n{chunk}" # Add the question to the chunk - response = generate_chat_response(prompt) - print(response) - responses.append(response) - -# Save the responses to a new file -responses_output_file = "responses.txt" -with open(responses_output_file, 'w') as responses_file: - for response in responses: - responses_file.write(response + '\n') - -print(f"Responses saved to {responses_output_file}") - From 6b2120a5f05f85f0d58ac9a791c27ab8b0a9905f Mon Sep 17 00:00:00 2001 From: Preethi1609 Date: Tue, 3 Oct 2023 18:08:20 -0400 Subject: [PATCH 03/18] add support for local llm models --- config.py | 3 +++ datastructure/aidDataframe.py | 10 +++++++++- functions/semantic_cache.py | 8 +++++--- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/config.py b/config.py index 8905a18..6f4e6ec 100644 --- a/config.py +++ b/config.py @@ -1,7 +1,10 @@ class Config: def __init__(self) -> None: self.open_ai_key = "" + self.local_llm_model = "llama-2-7b-chat.ggmlv3.q4_0.bin" def get_open_ai_key(self): return self.open_ai_key + def get_local_llm_model(self): + return self.local_llm_model \ No newline at end of file diff --git a/datastructure/aidDataframe.py b/datastructure/aidDataframe.py index c9093e0..4b0652b 100644 --- a/datastructure/aidDataframe.py +++ b/datastructure/aidDataframe.py @@ -1,6 +1,7 @@ import pandas as pd import numpy as np import openai +from gpt4all import GPT4All from langchain.llms import OpenAI from langchain.agents import create_pandas_dataframe_agent from langchain.chat_models import ChatOpenAI @@ -72,6 +73,11 @@ def initialize_middleware(self): self.openai_model = "text-davinci-003" return + def initialize_local_llm_model(self): + local_llm_model = self.config.get_local_llm_model() + self.local_llm = GPT4All(local_llm_model) + return + def query_dataframe(self, query): if query not in self.cache: ans = self.llm_agent.run(query) @@ -88,7 +94,9 @@ def code_error_correction(self, query, error, old_python_code): return answer - def chat(self, prompt): + def chat(self, prompt, local=False): + if local: + return self.local_llm.generate(prompt) ans = self.llm_agent.run(prompt) return ans diff --git a/functions/semantic_cache.py b/functions/semantic_cache.py index 37a764f..e3664df 100644 --- a/functions/semantic_cache.py +++ b/functions/semantic_cache.py @@ -40,12 +40,14 @@ def name(self) -> str: def forward(self, df: pd.DataFrame) -> pd.DataFrame: query = df[0][0] + print("query is: QQQQ", query) req_df = df.drop([0], axis=1) smart_df = AIDataFrame(req_df, description="A dataframe about cars") - smart_df.initialize_middleware() - - response = smart_df.chat(query) + # smart_df.initialize_middleware() + smart_df.initialize_local_llm_model() + # response = smart_df.chat(query) + response = smart_df.chat(query, local=True) df_dict = {"response": [response]} From 631d47107c582cd34b8ef329498795c1caed7945 Mon Sep 17 00:00:00 2001 From: Preethi1609 Date: Wed, 4 Oct 2023 16:42:37 -0400 Subject: [PATCH 04/18] change prompt, add vars to eva udf, test local llm --- run_test.sh | 2 + test/test_chat_with_pandas_local_llm.py | 61 +++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 run_test.sh create mode 100644 test/test_chat_with_pandas_local_llm.py diff --git a/run_test.sh b/run_test.sh new file mode 100644 index 0000000..5900f18 --- /dev/null +++ b/run_test.sh @@ -0,0 +1,2 @@ +export PYTHONPATH=$PWD +python3 -m unittest discover test/ diff 
--git a/test/test_chat_with_pandas_local_llm.py b/test/test_chat_with_pandas_local_llm.py new file mode 100644 index 0000000..d90e42d --- /dev/null +++ b/test/test_chat_with_pandas_local_llm.py @@ -0,0 +1,61 @@ +import unittest +import os +import pandas as pd +import evadb + +class TestEvaDBFunctions(unittest.TestCase): + + def setUp(self): + self.conn = evadb.connect() + self.cursor = self.conn.cursor() + print("Connected to EvaDB") + + create_function_query = f"""CREATE FUNCTION IF NOT EXISTS ChatWithPandas + IMPL './functions/semantic_cache.py' use_local_llm 'True' local_llm_model "llama-2-7b-chat.ggmlv3.q4_0.bin"; + """ + self.cursor.query("DROP FUNCTION IF EXISTS ChatWithPandas;").execute() + self.cursor.query(create_function_query).execute() + print("Created Function") + + create_table_query = """ + CREATE TABLE IF NOT EXISTS CARSDATA( + id INTEGER, + name TEXT(30), + mpg INTEGER, + cyl FLOAT(64,64), + disp FLOAT(64,64), + hp FLOAT(64,64), + drat FLOAT(64,64), + wt FLOAT(64,64), + qsec FLOAT(64,64), + vs FLOAT(64,64), + am FLOAT(64,64), + gear FLOAT(64,64), + carb FLOAT(64,64) + ); + """ + load_data_query = """ LOAD CSV 'data/cars.csv' INTO CARSDATA; + """ + + self.cursor.query(create_table_query).execute() + self.cursor.query(load_data_query).execute() + print("Loaded data") + + def test_mean_of_gear_column(self): + chat_query = "SELECT ChatWithPandas('what is the mean of the gear column', gear, name) FROM CARSDATA;" + result = self.cursor.query(chat_query).execute() + print("RESULTT-", result) + self.assertIsNotNone(result) + + def test_highest_gear_value_car(self): + chat_query = "SELECT ChatWithPandas('which car has the highest gear value', gear, name) FROM CARSDATA;" + result = self.cursor.query(chat_query).execute() + print("RESULTTT2: ", result) + self.assertIsNotNone(result) + + def tearDown(self): + self.cursor.close() + print("Closed EvaDB connection") + +if __name__ == '__main__': + unittest.main() From 9a1f31db1a65d4a027e0205773a790a0a9faf88a Mon Sep 17 00:00:00 2001 From: Preethi1609 Date: Tue, 3 Oct 2023 17:18:10 -0400 Subject: [PATCH 05/18] remove files added the file structure and basic files added config.json updated the files created a data file updates to the AI dataframe An executor function --- README.md | 1 - apps/data/eva_paper.txt | 1 - apps/pandas_qa_local.py | 143 ---------------------- chat_runner.py | 48 ++++++++ config.py | 7 ++ data/cars.csv | 33 +++++ datastructure/aidDataframe.py | 95 +++++++++++++++ functions/README.md | 0 prompts/error_correction_prompt.py | 17 +++ utils/chunk.py | 70 ----------- utils/script.py | 185 ----------------------------- utils/search_term.py | 114 ------------------ 12 files changed, 200 insertions(+), 514 deletions(-) delete mode 100644 README.md delete mode 100644 apps/data/eva_paper.txt delete mode 100644 apps/pandas_qa_local.py create mode 100644 chat_runner.py create mode 100644 config.py create mode 100644 data/cars.csv create mode 100644 datastructure/aidDataframe.py create mode 100644 functions/README.md create mode 100644 prompts/error_correction_prompt.py delete mode 100644 utils/chunk.py delete mode 100644 utils/script.py delete mode 100644 utils/search_term.py diff --git a/README.md b/README.md deleted file mode 100644 index 4e55451..0000000 --- a/README.md +++ /dev/null @@ -1 +0,0 @@ -# pandas-ai-integration \ No newline at end of file diff --git a/apps/data/eva_paper.txt b/apps/data/eva_paper.txt deleted file mode 100644 index 1dee18f..0000000 --- a/apps/data/eva_paper.txt +++ /dev/null @@ -1 +0,0 @@ -EVA: An 
End-to-End Exploratory Video Analytics System Gaurav Tarlok Kakkar, Jiashen Cao, Pramod Chunduri, Zhuangdi Xu, Suryatej Reddy Vyalla, Prashanth Dintyala, Anirudh Prabakaran, Jaeho Bang, Aubhro Sengupta, Kaushik Ravichandran, Ishwarya Sivakumar, Aryan Rajoria, Ashmita Raju, Tushar Aggarwal, Abdullah Shah, Sanjana Garg, Shashank Suman, Myna Prasanna Kalluraya, † Subrata Mitra , Ali Payani‡ , Yao Lu★, Umakishore Ramachandran, Joy Arulraj Georgia Institute of Technology † Adobe, ‡ Cisco, ★Microsoft arulraj@gatech.edu Abstract In recent years, deep learning models have revolutionized computer vision, enabling diverse applications. However, these models are computationally expensive, and leveraging them for video analytics involves low-level imperative programming. To address these efficiency and usability challenges, the database community has developed video database management systems (VDBMSs). However, existing VDBMSs lack extensibility and composability and do not support holistic system optimizations, limiting their practical application. In response to these issues, we present our vision for EVA, a VDBMS that allows for extensible support of user-defined functions and employs a Cascades-style query optimizer. Additionally, we leverage Ray’s distributed execution to enhance scalability and performance and explore hardware-specific optimizations to facilitate runtime optimizations. We discuss the architecture and design of EVA, our achievements thus far, and our research roadmap. many libraries (e.g., PyTorch [28], OpenCV [5], and Pandas [26]) to leverage these vision models. To tackle these efficiency and usability challenges, database researchers have proposed several video database management systems (VDBMSs) [4, 9, 20, 22, 24, 34]. These systems improve usability by supporting declarative SQL-like queries over videos. VDBMSs have applications across several domains, including movie analysis, monitor wildlife behavior [12, 19], monitor traffic [41], analyze retail store performance [36]. For example, a movie analyst may issue the following query to study the emotion palette of actors in a movie dataset [23]: /* Movie Analysis */ SELECT E m o t i o n C l a s s i f i c a t i o n ( Crop ( data , bbox ) ) FROM MOVIE CROSS APPLY UNNEST ( FaceDetection ( data ) ) AS Face ( bbox , conf ) WHERE id < 1000 AND conf > 0.8; Listing 1: Illustrative EVAQL query ACM Reference Format: Gaurav Tarlok Kakkar, Jiashen Cao, Pramod Chunduri, Zhuangdi Xu, Suryatej Reddy Vyalla, Prashanth Dintyala, Anirudh Prabakaran, Jaeho Bang, Aubhro Sengupta, Kaushik Ravichandran, Ishwarya Sivakumar, Aryan Rajoria, Ashmita Raju, Tushar Aggarwal, Abdullah Shah, Sanjana Garg, Shashank Suman, Myna Prasanna Kalluraya,, Subrata Mitra† , Ali Payani‡ , Yao Lu★ , Umakishore Ramachandran, Joy Arulraj. 2023. EVA: An End-toEnd Exploratory Video Analytics System. In Data Management for End-toEnd Machine Learning (DEEM ’23), June 18, 2023, Seattle, WA, USA. ACM, New York, NY, USA, 5 pages. https://doi.org/10.1145/3595360.3595858 1 INTRODUCTION Advances in computer vision [11, 32] over the last decade has led to high interest among domain scientists and industry practitioners in leveraging vision models in their applications. However, there are efficiency and usability challenges associated with deploying vision pipelines in practice [20]. First, from a resource efficiency standpoint, these deep learning models are highly expensive to run on every frame of the video due to their depth (i.e., number of neural network layers). 
Second, from a usability standpoint, the domain scientist must do low-level imperative programming across This work is licensed under a Creative Commons Attribution International 4.0 License. DEEM ’23, June 18, 2023, Seattle, WA, USA © 2023 Copyright held by the owner/author(s). ACM ISBN 979-8-4007-0204-4/23/06. https://doi.org/10.1145/3595360.3595858 Here, the query invokes user-defined functions (UDFs) that wrap around vision models [29]. It first retrieves the bounding boxes of all the faces present in the initial 1000 frames of the MOVIE video using the FaceDetection UDF [35]. It filters out the faces for which the FaceDetection model has lower confidence (< 0.8). Next, it identifies the emotion of each confidently-detected face using EmotionClassification UDF. Prior Work. To efficiently process such queries, the state-of-theart (SoTA) VDBMSs use a suite of database-inspired optimizations. For instance, PP [24] trains a lightweight model to quickly filter out irrelevant frames (e.g., frames that are not likely to contain a person), and only runs the heavyweight models on a subset of frames that pass through the filter model. It reduces the query processing time and improves resource efficiency by reducing the number of invocations of the heavyweight oracle models. What do Existing Systems Lack? 1 Extensibility and Composability: They do not allow users to define their own user-defined functions (UDFs) for vision models, and lack the ability of compose UDFs to accomplish complex tasks (Listing 1). Furthermore, these VDBMSs mainly focus on queries over detected video objects and do not support richer vision queries like action localization [37]. 2 Holistic System Optimizations: Prior systems primarily focus on optimizing each query in isolation, even though workloads have significant overlapping computation (e.g., redundant inference using a vision model over the same frame across queries) [40]. They often use lightweight proxy models to accelerate query execution. DEEM ’23, June 18, 2023, Seattle, WA, USA Kakkar et. al. Parser (EVA Query Language) Query Optimizer (Cascades-style) LOAD VIDEO “movies/*.mp4” INTO MOVIES; SELECT id, FaceDetector(data).bboxes FROM MOVIES; Input Query Execution Engine (Derived Models, Ray, PyTorch, AQP) Storage Engine (Video + Derived Data Structures) Output EVA Figure 1: Architecture of EVA So, they do not support holistic optimization for more complex queries, both during query optimization and execution. These limitations significantly constrain the adoption of VDBMSs in practical applications. Raven [27] optimizes ML and relational pipelines with cross-query optimization. Gandhi et al. [16] utilizes tensor abstraction for trainable pipelines in AI and relational workloads. We plan to support the training pipeline in the future. Our Vision. To overcome existing limitations, we’re developing an innovative VDBMS that’s specifically designed for exploratory video analytics - EVA. EVA provides extensible support for UDFs(§ 3.1), allowing users to define bespoke UDFs based on their requirements and compose them with existing UDFs and operators to construct complex queries. For example, the FaceDetection and EmotionClassification models can be used to construct an emotion detection query. Additionally, UDFs can import third-party Python packages and execute arbitrary logic, which makes it easy for EVA to support new features in the future. 
To optimize query plans, EVA contains a Cascades-style query optimizer (§ 3.2) that leverages different forms of derived models and data structures. Like relational DBMSs, EVA estimates the cost of query plans by profiling operator costs and estimating predicate selectivity. It goes further by optimizing for query accuracy (§ 4.2). Moreover, EVA’s distributed Execution Engine powered by Ray (§ 3.3) provides additional scalability and performance. We’re also exploring hardware-specific optimizations and drawing inspiration from the adaptive query processing literature [13] to facilitate runtime optimizations (§ 4.3). 2 ARCHITECTURE of EVA The architecture of the EVA VDBMS is shown in Fig. 1. We first present the query language that the Parser supports. We then describe the internals of the other three components. 2.1 EVA Query Language (EVAQL) EVA’s parser supports a query language tailored for exploratory video analytics, called EVAQL. The queries in this section all concern a movie dataset. EVA stores all the videos of this dataset in the following table: MOVIE_DATA ( ID SERIAL INTEGER , VIDEO_ID INTEGER , VIDE O_FRAM E_ID INTEGER , VIDEO_NAME TEXT (30) , DATA NDARRAY UINT8 (3 , ANYDIM , ANYDIM ) ) ; Listing 2: Schema of the movie dataset Loading Data. EVA supports loading both videos and semi-structured data. The following query depicts how the user loads videos in EVA: /* Loading a video into the table */ LOAD VIDEO ' movies /*. mp4 ' INTO MOVIE_DATA ; EVA automatically creates a table called MOVIE_DATA with following columns: (1) id, (2) data, (3) video_id, (4) video_frame_id, and (5) video_name. They denote the frame identifier, the contents of the frame, and the video to which that frame belongs to. EVAQL supports queries for loading structured data (e.g., CSVs) for populating the metadata of videos (e.g., bounding boxes of faces in a frame). Similar to traditional DBMSs, the user must explicitly define the schema before loading the CSV file: /* Defining the schema and loading a CSV file */ CREATE TABLE IF NOT EXISTS M OVIE_M ETADAT A ( ID SERIAL INTEGER , VIDEO_ID INTEGER , VID EO_FRA ME_ID INTEGER , VIDEO_NAME TEXT (30) , FACE_BBOXES NDARRAY FLOAT32 (4) ) ; LOAD CSV ' movie . csv ' INTO MOVIE_ METADATA ; User-Defined Functions. EVAQL is tailored for supporting userdefined functions (UDFs). UDFs allow users to extend the VDBMS to support the requirements of their applications. In EVA, UDFs are often wrappers around deep learning models. For example, a face detection UDF takes a frame as input and returns the bounding boxes of the faces detected in the frame as output. Internally, it wraps around a FaceDetection PyTorch model [35]. EVAQL supports arbitrary UDFs that take a variety of inputs (e.g., video meta-data or raw frames etc.) and generate a variety of outputs (e.g., labels, bounding boxes, video frames, etc.). The following command registers a FaceDetection UDF in EVA: /* Registering a User - Defined Function */ CREATE UDF IF NOT EXISTS FaceDetector TYPE FaceDetection IMPL '/ udfs / face_detector . py ' PROPERTIES ( ' ACCURACY '= ' HIGH ') ; TYPE specifies the logical model type of the UDF (e.g., FaceDetection or ObjectDetection). IMPL specifies the path to the Python file containing the implementation of the UDF. Internally, EVA uses importlib for creating an importing UDF objects from the file [14]. The user can specify other metadata like the accuracy in PROPERTIES. EVA uses these properties to accelerate queries. 
For example, if the overall query accuracy requirement is moderate (e.g., 0.8× the oracle model), EVA uses faster (but less accurate) models of the same model type to accelerate the query. After registering the UDF, it can be executed on a video as shown in Listing 1. Interfaces. EVA currently supports EVAQL queries from both a command line interface and Jupyter notebooks. We seek to support a Pythonic dataframe API in the future. 2.2 Query Optimizer EVA’s Optimizer is based on the Cascades framework [17]. It applies a series of rules for rewriting the query and then performs cost-based optimization to generate a physical query plan . The Optimizer in a VDBMS differs from that in a relational DBMS in two ways. First, it must focus on minimizing query processing time while meeting the accuracy constraint (which often does not exist in a typical relational DBMS). Second, it is expensive to derive statistics from videos a priori as that involves running expensive deep learning models. So, while processing an ad-hoc query, the Optimizer runs vision models on a subset of frames to guide important optimization decisions (e.g., whether the query plan will meet the accuracy constraint or how should the predicates invoking vision models be ordered [22, 31, 40]). EVA: An End-to-End Exploratory Video Analytics System (a) A (b) DEEM ’23, June 18, 2023, Seattle, WA, USA A A A A (c) A UDF1(r) UDF2(r) UDF(r) UDF(r) A UDF1(r) ProxyUDF(r) UDF2(r) Figure 2: Illustrative UDF Optimization Rules – (a) UDF transformation rule that extracts the UDF from the predicate and converts to an APPLY operator, (b) UDF filtering rule that introduces a proxy UDF model for quickly filtering out irrelevant frames before executing UDF, and (c) UDF reordering rule that reorders UDFs based on their inference cost and availability of materialized results from prior queries. 2.3 Execution Engine The Execution Engine is responsible for evaluating the query plan generated by the Optimizer. While executing the plan, it leverages heterogeneous computational units (e.g., CPUs and GPUs). EVA leverages DL frameworks like PyTorch [28] for model inference. In an earlier prototype of EVA [40], the Execution Engine did not support distributed query execution. We have recently added support for distributed query execution (§ 3.3) using Ray [25]. 2.4 Storage Engine Lastly, the Storage Engine is responsible for managing the videos. In an earlier prototype of EVA [40], the Storage Engine organized the videos as a sequence of decoded frames, similar to SoTA VDBMSs [20]. However, this approach not only significantly increases the storage footprint of EVA on larger datasets but also does not provide any significant reduction in query execution time. We have subsequently redesigned the Storage Engine to manage videos in a compressed format. The Storage Engine manages structured data (e.g., bounding boxes of faces) on disk using the Parquet format [1]. It uses Arrow [30] as an in-memory columnar format for data that is being read or written using on-disk Parquet files. EVA supports defining UDFs using function decorators in Python. This allows users to migrate their existing deep learning models to EVA with a few lines of Python code. Users define the input and output formats of their models and configuration options through the decorator-based syntax. In Listing 3.1, the @setup decorator specifies the configuration options for the UDF. 
3 PROGRESS
We are implementing EVA as a Python package under the Apache License, based on a client-server architecture [2]. We have made progress on enhancing the extensibility of EVA, and the efficacy of the Optimizer and the Execution Engine.

3.1 Extensibility - Importing UDFs
EVA allows users to import their own UDFs in two ways. Users can either import their own implemented UDFs (i.e., from source) or import them from popular third-party platforms (e.g., HuggingFace [39] and PyTorch).

UDF from Source. EVA supports defining UDFs using function decorators in Python. This allows users to migrate their existing deep learning models to EVA with a few lines of Python code. Users define the input and output formats of their models and configuration options through the decorator-based syntax. In Listing 3, the @setup decorator specifies the configuration options for the UDF. The user specifies properties such as whether EVA can cache the results of the UDF and whether the UDF supports batch-mode execution. The @forward decorator specifies the input and output types and dimensions for the UDF.

# Configuring a UDF with decorators
class ImageClassificationUDF:
    @setup(cachable=True, batchable=True,
           udf_type="ImageClassification")
    def setup(self):
        # prepare the UDF
        ...

    @forward(
        input_signatures=[PyTorchTensor(type=NdArrayType.FLOAT32,
                                        dimensions=(1, 3, 540, 960))],
        output_signatures=[PandasDataframe(columns=["label"],
                                           column_types=[NdArrayType.STR])],
    )
    def forward(self):
        # do inference
        ...
Listing 3: Configuring a UDF with decorators

UDF from HuggingFace. Recently, HuggingFace [39] has gained popularity in the deep learning community for its support of various models across multiple data modalities (e.g., text, audio, and video). EVA supports HuggingFace tasks and models out of the box. Users define tasks or specify models using EVA's declarative language:

/* Registering an ObjectDetector model */
CREATE UDF MyObjectDetector
TYPE HuggingFace
PROPERTIES ('task'='object-detection',
            'model'='facebook/detr-resnet-50');

Here, the user adds a UDF that performs object detection using the facebook/detr-resnet-50 model.

3.2 Query Optimizer - Reuse of Inference Results
UDFs are often the most expensive operators in VDBMS queries. To accelerate such queries, EVA materializes the results of UDFs and reuses them while processing subsequent queries in exploratory video analytics [40]. Reusing the results of UDFs in VDBMSs differs from the query plan matching algorithms in traditional DBMSs [3], which focus on expensive join operators. In contrast, in VDBMSs, UDFs frequently occur in predicates and projection lists. EVA's Optimizer supports novel general-purpose rewrite rules that are not present in SoTA VDBMSs. For example, to identify reuse opportunities, the Optimizer uses a UDF-centric rewrite rule (Fig. 2 (a)) that extracts the UDF from the predicate/projection expression and rewrites it using the CROSS APPLY operator [15]. The resulting query plan makes it feasible to explore rules like: (1) materializing and reusing the results of UDFs [40], (2) adding derived models (Fig. 2 (b)) [20, 21], (3) UDF reordering (Fig. 2 (c)), (4) UDF de-duplication, and (5) introducing a video sampling operator before the UDF. Here, UDF de-duplication refers to avoiding redundant computation of a UDF that occurs multiple times in a single query. For example, if both the UDFs in the left-hand-side query tree in Fig. 2 (c) are identical, we merge them into a single apply operator.
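The following is a minimal sketch, in plain Python, of the reuse idea behind rule (1): materialized UDF results are keyed by the UDF's name and the frames it was applied to, so a later query can skip re-running the model. The key structure and the in-memory dictionary are simplifications of EVA's actual materialized views [40].

# A toy materialized-view store for UDF results (illustrative only).
from typing import Callable, Dict, List, Tuple


class ReuseStore:
    def __init__(self) -> None:
        # (udf_name, frame_id) -> previously computed result
        self._store: Dict[Tuple[str, int], object] = {}

    def apply(self, udf_name: str, udf: Callable[[int], object],
              frame_ids: List[int]) -> List[object]:
        results = []
        for fid in frame_ids:
            key = (udf_name, fid)
            if key not in self._store:        # miss: invoke the model
                self._store[key] = udf(fid)
            results.append(self._store[key])  # hit: reuse prior inference
        return results


# A first query runs FaceDetector over frames 0-999; a second query over
# an overlapping range reuses those results instead of re-invoking the model.
store = ReuseStore()
detect = lambda fid: f"faces@{fid}"  # stand-in for expensive model inference
store.apply("FaceDetector", detect, list(range(1000)))
cached = store.apply("FaceDetector", detect, list(range(500, 1500)))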
3.3 Execution Engine - Integrating Ray
Our primary objective in integrating Ray into EVA is to support distributed query execution. We seek to initially support intra-query parallelism [18]. Consider a query that runs the FaceDetection UDF on a movie video with 13K frames using a server with two GPUs. With a single GPU, it takes 402 s to process the query. Using Ray, EVA automatically splits the video into two partitions and uses both GPUs for model inference, reducing the query processing time to 209 s. Besides data-level parallelism, EVA also supports parallel processing of complex query predicates. For example, to evaluate "UDF1(a) < 10 AND UDF2(b) > 20", the VDBMS may either evaluate the two atomic predicates in parallel, or perform canonical predicate reordering and short-circuit the predicate evaluation.

Figure 3: Illustration of the Exchange Operator – this query retrieves the emotions of all the faces detected in the video.

Exchange Operator. The Optimizer uses the exchange operator [6] to encapsulate the degree of parallelism (DOP) in the query plan. The exchange operator splits the plan into two stages and configures the parallelism of the lower stage. Consider the query plan shown in Fig. 3. First, as specified by the lower exchange operator, two processes run the FaceDetection UDF on the video. Then, the upper exchange operator indicates that a single process should run the EmotionClassification UDF on the bounding boxes of the detected faces. To leverage Ray, the Optimizer in EVA transforms the query plan into Ray actors and chains them via Ray queues.
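As a concrete illustration, here is a minimal sketch, assuming the ray package, of the actor-and-queue structure described above: two GPU-pinned actors drain a shared input queue of frame batches (the lower stage with DOP 2), and a single consumer processes their outputs (the upper stage). The UDF bodies are stubs, not EVA's operators.

import ray
from ray.util.queue import Queue

ray.init()  # assumes a Ray cluster (or local machine) with two visible GPUs


@ray.remote(num_gpus=1)
class FaceDetectorActor:
    def run(self, in_q: Queue, out_q: Queue) -> None:
        # drain frame batches until the producer signals completion
        while True:
            batch = in_q.get()
            if batch is None:
                break
            out_q.put(f"faces({batch})")  # stand-in for model inference


in_q, out_q = Queue(), Queue()
workers = [FaceDetectorActor.remote() for _ in range(2)]  # dop = 2
handles = [w.run.remote(in_q, out_q) for w in workers]

for batch in ("frames[0:6500]", "frames[6500:13000]"):
    in_q.put(batch)
for _ in workers:  # one termination sentinel per worker
    in_q.put(None)

ray.get(handles)  # wait for the lower stage to finish
while not out_q.empty():
    print(out_q.get())  # single-process upper stage (e.g., EmotionClassification)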
4 ROADMAP
We next describe our ongoing work and open questions in implementing EVA. We seek to continue improving the usability of EVA, and also the efficacy of the Optimizer and the Execution Engine.

4.1 Extensibility - Enhancing Querying Capability
Action Queries. In our prior work on Zeus [8], we emphasized the need to improve the querying capabilities of VDBMSs to encompass action queries. Zeus assumes the availability of a vision model explicitly trained for the target action (e.g., a person riding a motorcycle). However, in real-world applications the action may rarely occur in the dataset, leading to insufficient true-positive examples (i.e., class imbalance) during training. In addition, the number of ad-hoc combinations of objects and their interactions that form actions is exponential. To overcome these challenges, we seek to pursue a more practical approach in EVA. We are investigating techniques to break ad-hoc actions into a collection of spatio-temporal predicates over the bounding boxes and the trajectories of objects across a sequence of frames [10, 33].

Similarity Search. To meet the needs of real-world applications [38], we seek to support object re-identification and similarity search queries in EVA. Consider a query that retrieves all the frames in a movie that contain a target actor. Efficiently searching for the specific actor using a target image requires the use of computationally expensive object re-identification models. We are currently investigating the integration of incremental search techniques into EVA's Optimizer to accelerate re-identification queries.

4.2 Query Optimizer - Accuracy-Guided Optimization
As in relational DBMSs, the VDBMS's Optimizer estimates the query plan's cost by profiling the cost of the operators and estimating the selectivity of predicates. However, there are two key differences. First, deep learning models are not always accurate. So, unlike relational DBMSs, VDBMSs cannot guarantee accurate results. This gives the Optimizer an opportunity to jointly optimize the query plan for both runtime performance and accuracy constraints. Second, the Optimizer must not treat a UDF as a black box. Instead, it should exploit the semantic properties of UDFs. For example, the Optimizer in EVA has the flexibility to pick a suitable physical model for processing a logical vision task, as long as it meets the query's accuracy constraint. In our prior work [7], we showed how the Optimizer may dynamically pick different models for processing video chunks of varying complexity. We are investigating how to extend the Cascades-style Optimizer in EVA to jointly optimize for query execution cost and query accuracy. We seek to support complex model pipelines – proxy models, model cascades, and model ensembles.

4.3 Execution Engine - GPU-aware Optimization
Resource Utilization. As EVA extensively uses GPUs for query processing, it is critical to optimize query execution on GPUs. The Optimizer needs to insert the exchange operator and tune the degree-of-parallelism (DOP) parameter. The optimal DOP value depends on the model execution cost, the overall query, and the underlying data. We are investigating how to optimize this critical parameter to better leverage GPUs. Concretely, given the number of GPUs and their computational capabilities, EVA must decide where to inject the exchange operators in the query plan, and what the suitable degree of parallelism is for each operator. To achieve this, the Optimizer first generates a statically optimized plan. Later, it leverages the adaptive Execution Engine by adjusting the pipeline dynamically during execution to reduce the overall processing time.

Minimizing Data Transfer Cost. First, in queries with multiple UDFs, the same input frames may be transferred from the CPU to the GPU multiple times during query execution. Second, EVA only has CPU implementations of certain operators like join, predicate filtering, and cropping, which results in data transfer between the CPU and the GPU between different operators (e.g., 10 GB of additional data movement for the query shown in Listing 1). To minimize this cost, we seek to investigate two optimizations: (1) lazy eviction and (2) operator fusion. First, with lazy eviction, the Execution Engine caches frames on the GPU if they are required by later operators in the query pipeline. Second, with operator fusion, we plan to add GPU-centric implementations of general-purpose operators (e.g., join and image cropping) to reduce data movement overhead.
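As a concrete illustration of lazy eviction, here is a minimal sketch, assuming PyTorch, in which decoded frames are moved to the GPU once and kept resident while consecutive operators consume them. The cache policy and the stand-in operators are simplifications of what the Execution Engine would actually decide from the query plan.

import torch


class GpuFrameCache:
    """Keeps frames resident on the GPU while later operators still need them."""

    def __init__(self) -> None:
        self._cache: dict = {}  # frame_id -> GPU-resident tensor

    def get(self, frame_id: int, decode) -> torch.Tensor:
        # transfer to the GPU only on first use; later operators hit the cache
        if frame_id not in self._cache:
            self._cache[frame_id] = decode(frame_id).to("cuda")  # assumes a GPU
        return self._cache[frame_id]

    def evict(self, frame_id: int) -> None:
        # called once the plan guarantees no later operator needs the frame
        self._cache.pop(frame_id, None)


cache = GpuFrameCache()
decode = lambda fid: torch.rand(3, 540, 960)  # stand-in for the video decoder
frame = cache.get(0, decode)                  # one CPU-to-GPU transfer
faces = frame[:, 100:200, 150:250]            # GPU-side crop, no round trip
emotions = faces.mean()                       # stand-in for the downstream UDF
cache.evict(0)                                # safe to release the frame now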
5 CONCLUSION
In this paper, we present our vision, current progress, and roadmap for future improvements to EVA, focusing on querying capability, query optimization, and query execution. We hope that EVA will enable a broader set of application developers to leverage recent advances in computer vision for analyzing unstructured data.

References
[1] Apache Parquet. https://parquet.apache.org/.
[2] EVA Video Database System. https://pypi.org/project/evadb/.
[3] A. Jindal, K. Karanasos, S. Rao, and H. Patel. Selecting Subexpressions to Materialize at Datacenter Scale. In VLDB, 2018.
[4] F. Bastani, S. He, A. Balasingam, K. Gopalakrishnan, M. Alizadeh, H. Balakrishnan, M. Cafarella, T. Kraska, and S. Madden. MIRIS: Fast Object Track Queries in Video. In SIGMOD, pages 1907–1921, 2020.
[5] G. Bradski. The OpenCV Library. Dr. Dobb's Journal of Software Tools, 2000.
[6] E. Brewer. Volcano & the Exchange Operator, 2022.
[7] J. Cao, K. Sarkar, R. Hadidi, J. Arulraj, and H. Kim. FiGO: Fine-Grained Query Optimization in Video Analytics. In SIGMOD, pages 559–572, 2022.
[8] P. Chunduri, J. Bang, Y. Lu, and J. Arulraj. Zeus: Efficiently Localizing Actions in Videos Using Reinforcement Learning. In SIGMOD, pages 545–558, 2022.
[9] M. Daum, B. Haynes, D. He, A. Mazumdar, M. Balazinska, and A. Cheung. TASM: A Tile-Based Storage Manager for Video Analytics. ArXiv, abs/2006.02958, 2020.
[10] M. Daum, E. Zhang, D. He, M. Balazinska, B. Haynes, R. Krishna, A. Craig, and A. Wirsing. VOCAL: Video Organization and Interactive Compositional AnaLytics. In CIDR, 2022.
[11] J. Dean, D. Patterson, and C. Young. A new golden age in computer architecture: Empowering the machine-learning revolution. IEEE Micro, 38(2):21–29, 2018.
[12] J. Dellinger, C. Shores, A. Craig, S. Kachel, M. Heithaus, W. Ripple, and A. Wirsing. Predators reduce niche overlap between sympatric prey. Oikos, 2021.
[13] A. Deshpande, Z. Ives, V. Raman, et al. Adaptive query processing. Foundations and Trends in Databases, 1(1):1–140, 2007.
[14] Python Software Foundation. importlib – the implementation of import, 2022.
[15] C. Galindo-Legaria and M. Joshi. Orthogonal optimization of subqueries and aggregation. In SIGMOD, 2001.
[16] A. Gandhi, Y. Asada, V. Fu, A. Gemawat, L. Zhang, R. Sen, C. Curino, J. Camacho-Rodríguez, and M. Interlandi. The Tensor Data Platform: Towards an AI-centric Database System. In CIDR, 2023.
[17] G. Graefe. The Cascades Framework for Query Optimization. IEEE Data Eng. Bull., 18(3):19–29, 1995.
[18] N. Hardavellas and I. Pandis. Intra-Query Parallelism, pages 1567–1568. Springer US, Boston, MA, 2009.
[19] M. Heithaus, L. Dill, G. Marshall, and B. Buhleier. Habitat use and foraging behavior of tiger sharks (Galeocerdo cuvier) in a seagrass ecosystem. Marine Biology, 140(2):237–248, 2002.
[20] D. Kang, P. Bailis, and M. Zaharia. BlazeIt: Optimizing Declarative Aggregation and Limit Queries for Neural Network-Based Video Analytics. Proc. VLDB Endow., 13:533–546, 2019.
[21] D. Kang, J. Emmons, F. Abuzaid, P. Bailis, and M. Zaharia. NoScope: Optimizing Neural Network Queries over Video at Scale. VLDB, 10(11):1586–1597, Aug. 2017.
[22] D. Kang, F. Romero, P. D. Bailis, C. Kozyrakis, and M. Zaharia. VIVA: An end-to-end system for interactive video analytics. In CIDR, 2022.
[23] G. Liu, H. Shi, A. Kiani, A. Khreishah, J. Lee, N. Ansari, C. Liu, and M. M. Yousef. Smart Traffic Monitoring System using Computer Vision and Edge Computing. IEEE Transactions on Intelligent Transportation Systems, 2021.
[24] Y. Lu, A. Chowdhery, S. Kandula, and S. Chaudhuri. Accelerating Machine Learning Inference with Probabilistic Predicates. In SIGMOD, 2018.
[25] P. Moritz, R. Nishihara, S. Wang, A. Tumanov, R. Liaw, E. Liang, M. Elibol, Z. Yang, W. Paul, M. I. Jordan, and I. Stoica. Ray: A distributed framework for emerging AI applications. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18), pages 561–577, Carlsbad, CA, Oct. 2018. USENIX Association.
[26] Pandas. pandas-dev/pandas: Pandas, Feb. 2020.
[27] K. Park, K. Saur, D. Banda, R. Sen, M. Interlandi, and K. Karanasos. End-to-end Optimization of Machine Learning Prediction Queries. In SIGMOD, pages 587–601, 2022.
[28] A. Paszke, S. Gross, F. Massa, A. Lerer, J. Bradbury, G. Chanan, T. Killeen, Z. Lin, N. Gimelshein, L. Antiga, A. Desmaison, A. Köpf, E. Yang, Z. DeVito, M. Raison, A. Tejani, S. Chilamkurthy, B. Steiner, L. Fang, J.
Bai, and S. Chintala. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In NeurIPS, 2019.
[29] A. Rheinländer, U. Leser, and G. Graefe. Optimization of complex dataflows with user-defined functions. ACM Computing Surveys (CSUR), 50(3):1–39, 2017.
[30] N. Richardson, I. Cook, N. Crane, D. Dunnington, R. François, J. Keane, D. Moldovan-Grünfeld, J. Ooms, and Apache Arrow. arrow: Integration to Apache Arrow, 2022. https://github.com/apache/arrow/, https://arrow.apache.org/docs/r/.
[31] F. Romero, J. Hauswald, A. Partap, D. Kang, M. Zaharia, and C. Kozyrakis. Optimizing video analytics with declarative model relationships. Proc. VLDB Endow., 16(3):447–460, 2022.
[32] O. Russakovsky, J. Deng, H. Su, J. Krause, S. Satheesh, S. Ma, Z. Huang, A. Karpathy, A. Khosla, and M. Bernstein. ImageNet large scale visual recognition challenge. IJCV, 115(3):211–252, 2015.
[33] M. A. Sakr and R. H. Güting. Spatiotemporal pattern queries. GeoInformatica, 15(3):497–540, 2011.
[34] M. Satyanarayanan, P. B. Gibbons, L. B. Mummert, P. Pillai, P. Simoens, and R. Sukthankar. Cloudlet-based just-in-time indexing of IoT video. In Global Internet of Things Summit, GIoTS 2017, Geneva, Switzerland, June 6-9, 2017, pages 1–8. IEEE, 2017.
[35] F. Schroff, D. Kalenichenko, and J. Philbin. FaceNet: A unified embedding for face recognition and clustering. In 2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pages 815–823, 2015.
[36] A. W. Senior, L. M. Brown, A. Hampapur, C. Shu, Y. Zhai, R. S. Feris, Y. Tian, S. Borger, and C. R. Carlson. Video analytics for retail. In AVSS, pages 423–428. IEEE Computer Society, 2007.
[37] Z. Shou, D. Wang, and S.-F. Chang. Temporal action localization in untrimmed videos via multi-stage CNNs. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pages 1049–1058, 2016.
[38] T. Skopal, F. Falchi, J. Lokoc, M. L. Sapino, I. Bartolini, and M. Patella, editors. Similarity Search and Applications - 15th International Conference, SISAP 2022, Bologna, Italy, October 5-7, 2022, Proceedings, volume 13590 of Lecture Notes in Computer Science. Springer, 2022.
[39] T. Wolf, L. Debut, V. Sanh, J. Chaumond, C. Delangue, A. Moi, P. Cistac, T. Rault, R. Louf, M. Funtowicz, J. Davison, S. Shleifer, P. von Platen, C. Ma, Y. Jernite, J. Plu, C. Xu, T. L. Scao, S. Gugger, M. Drame, Q. Lhoest, and A. M. Rush. Transformers: State-of-the-art natural language processing. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, pages 38–45, Online, Oct. 2020. Association for Computational Linguistics.
[40] Z. Xu, G. T. Kakkar, J. Arulraj, and U. Ramachandran. EVA: A Symbolic Approach to Accelerating Exploratory Video Analytics with Materialized Views. In SIGMOD, pages 602–616, 2022.
[41] S. Yang, E. Bailey, Z. Yang, J. Ostrometzky, G. Zussman, I. Seskar, and Z. Kostic. COSMOS smart intersection: Edge compute and communications for bird's eye object tracking. In PerCom, pages 1–7. IEEE, 2020.
\ No newline at end of file diff --git a/apps/pandas_qa_local.py b/apps/pandas_qa_local.py deleted file mode 100644 index 2859f86..0000000 --- a/apps/pandas_qa_local.py +++ /dev/null @@ -1,143 +0,0 @@ -# coding=utf-8 -# Copyright 2018-2023 EvaDB -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os - -from gpt4all import GPT4All -import shutil -import subprocess -from typing import Dict - - -import pandas as pd - -APP_SOURCE_DIR = os.path.abspath(os.path.dirname(__file__)) -CURRENT_WORKING_DIR = os.getcwd() # used to locate evadb_data dir - -# default file paths -DEFAULT_TEXT_FILE_PATH = os.path.join(APP_SOURCE_DIR, "data", "eva_paper.txt") -MAX_CHUNK_SIZE = 2000 - -def receive_user_input() -> Dict: - """Receives user input. - - Returns: - user_input (dict): global configurations - """ - print( - "🔮 Welcome to EvaDB! This app lets you to run data analytics on a csv file like in a conversational manner.\nYou will only need to supply a path to csv file and an OpenAI API key.\n\n" - ) - user_input = dict() - - text_file_path = str( - input("📋 Enter the text file path (press Enter to use our default text file): ") - ) - - if text_file_path == "": - text_file_path = DEFAULT_TEXT_FILE_PATH - user_input["text_file_path"] = text_file_path - - return user_input - -def generate_script(df: pd.DataFrame, question: str) -> str: - """Generates script with llm. - - Args: - question (str): question to ask to llm. - - Returns - str: script generated by llm. - """ - # generate summary - all_columns = list(df) # Creates list of all column headers - df[all_columns] = df[all_columns].astype(str) - - prompt = f"""There is a dataframe in pandas (python). The name of the - dataframe is df. This is the result of print(df.head()): - {str(df.head())}\nAssuming the dataframe is already loaded and named 'df'. Do not include pd.read_csv, do not write code to load the CSV file. Return a python script to get the answer to a question. - Question : {question}. 
""" - - llm = GPT4All("llama-2-7b-chat.ggmlv3.q4_0.bin") - - script_body = llm.generate(prompt) - script_body = script_body.split("```")[1].lstrip("python") - return script_body - - - -def cleanup(): - """Removes any temporary file / directory created by EvaDB.""" - if os.path.exists("evadb_data"): - shutil.rmtree("evadb_data") - -def split_text_into_chunkss(text, max_chunk_size=MAX_CHUNK_SIZE): - chunks = [] - current_chunk = "" - - for line in text.splitlines(): - if len(current_chunk) + len(line) + 1 <= max_chunk_size: - # Add line to the current chunk - if current_chunk: - current_chunk += '\n' - current_chunk += line - else: - # Start a new chunk - chunks.append(current_chunk) - current_chunk = line - - if current_chunk: - chunks.append(current_chunk) - - return chunks - -def split_text_into_chunks(text, max_chunk_size=MAX_CHUNK_SIZE): - chunks = [] - start = 0 - end = max_chunk_size - - while start < len(text): - chunk = text[start:end] - chunks.append(chunk) - start = end - end += max_chunk_size - - return chunks - -if __name__ == "__main__": - # try: - # receive input from user - user_input = receive_user_input() - df = pd.read_csv(user_input["text_file_path"], names=['text']) - with open("/home/preethi/projects/pandas-ai-integration/apps/data/eva_paper.txt", 'r') as file: - file_contents = file.read() - - # Split the contents into chunks - text_chunks = split_text_into_chunks(file_contents) - chunked_output_file = "data_chunks.txt" - i = 0 - with open(chunked_output_file, 'w') as chunked_file: - for chunk in text_chunks: - i = i + 1 - chunk = chunk + '\n\n\n\n\n\n\n' - chunked_file.write(chunk) - print("chunk " + str(i)) - - print(f"Text chunks saved to {chunked_output_file}") - print("here1") - llm = GPT4All("llama-2-7b-chat.ggmlv3.q4_0.bin") - summaries = [] - for chunk in text_chunks: - summaries.append(llm.generate("Summarize this text" + chunk)) - print("SUMMARRYYYYYY", summaries) - \ No newline at end of file diff --git a/chat_runner.py b/chat_runner.py new file mode 100644 index 0000000..cf61d66 --- /dev/null +++ b/chat_runner.py @@ -0,0 +1,48 @@ +import os +import pandas as pd +import evadb + +cursor = evadb.connect().cursor() +print("Connected to EvaDB") + +create_function_query = f"""CREATE FUNCTION IF NOT EXISTS ChatWithPandas + IMPL './functions/semantic_cache.py'; + """ +cursor.query("DROP FUNCTION IF EXISTS ChatWithPandas;").execute() +cursor.query(create_function_query).execute() +print("Created Function") + +create_table_query = f""" +CREATE TABLE IF NOT EXISTS CARSDATA( +id INTEGER, +name TEXT(30), +mpg INTEGER, +cyl FLOAT(64,64), +disp FLOAT(64,64), +hp FLOAT(64,64), +drat FLOAT(64,64), +wt FLOAT(64,64), +qsec FLOAT(64,64), +vs FLOAT(64,64), +am FLOAT(64,64), +gear FLOAT(64,64), +carb FLOAT(64,64) +); +""" +load_data_query = f""" LOAD CSV 'data/cars.csv' INTO CARSDATA; +""" + +cursor.query(create_table_query).execute() +cursor.query(load_data_query).execute() +print("loaded data") + +chat_query1 = f""" SELECT ChatWithPandas('what is the mean of the gear column',gear, name) FROM CARSDATA; +""" + +result1 = cursor.query(chat_query1).execute() +print(result1) + +chat_query2 = f""" SELECT ChatWithPandas('which car has the highest gear value',gear, name) FROM CARSDATA; +""" +result2 = cursor.query(chat_query2).execute() +print(result2) \ No newline at end of file diff --git a/config.py b/config.py new file mode 100644 index 0000000..8905a18 --- /dev/null +++ b/config.py @@ -0,0 +1,7 @@ +class Config: + def __init__(self) -> None: + self.open_ai_key = "" + + def 
get_open_ai_key(self): + return self.open_ai_key + \ No newline at end of file diff --git a/data/cars.csv b/data/cars.csv new file mode 100644 index 0000000..6a750e8 --- /dev/null +++ b/data/cars.csv @@ -0,0 +1,33 @@ +id,name,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb +0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4 +1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4 +2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1 +3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1 +4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2 +5,Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1 +6,Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4 +7,Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2 +8,Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2 +9,Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4 +10,Merc 280C,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4 +11,Merc 450SE,16.4,8,275.8,180,3.07,4.07,17.4,0,0,3,3 +12,Merc 450SL,17.3,8,275.8,180,3.07,3.73,17.6,0,0,3,3 +13,Merc 450SLC,15.2,8,275.8,180,3.07,3.78,18.0,0,0,3,3 +14,Cadillac Fleetwood,10.4,8,472.0,205,2.93,5.25,17.98,0,0,3,4 +15,Lincoln Continental,10.4,8,460.0,215,3.0,5.424,17.82,0,0,3,4 +16,Chrysler Imperial,14.7,8,440.0,230,3.23,5.345,17.42,0,0,3,4 +17,Fiat 128,32.4,4,78.7,66,4.08,2.2,19.47,1,1,4,1 +18,Honda Civic,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2 +19,Toyota Corolla,33.9,4,71.1,65,4.22,1.835,19.9,1,1,4,1 +20,Toyota Corona,21.5,4,120.1,97,3.7,2.465,20.01,1,0,3,1 +21,Dodge Challenger,15.5,8,318.0,150,2.76,3.52,16.87,0,0,3,2 +22,AMC Javelin,15.2,8,304.0,150,3.15,3.435,17.3,0,0,3,2 +23,Camaro Z28,13.3,8,350.0,245,3.73,3.84,15.41,0,0,3,4 +24,Pontiac Firebird,19.2,8,400.0,175,3.08,3.845,17.05,0,0,3,2 +25,Fiat X1-9,27.3,4,79.0,66,4.08,1.935,18.9,1,1,4,1 +26,Porsche 914-2,26.0,4,120.3,91,4.43,2.14,16.7,0,1,5,2 +27,Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2 +28,Ford Pantera L,15.8,8,351.0,264,4.22,3.17,14.5,0,1,5,4 +29,Ferrari Dino,19.7,6,145.0,175,3.62,2.77,15.5,0,1,5,6 +30,Maserati Bora,15.0,8,301.0,335,3.54,3.57,14.6,0,1,5,8 +31,Volvo 142E,21.4,4,121.0,109,4.11,2.78,18.6,1,1,4,2 diff --git a/datastructure/aidDataframe.py b/datastructure/aidDataframe.py new file mode 100644 index 0000000..c9093e0 --- /dev/null +++ b/datastructure/aidDataframe.py @@ -0,0 +1,95 @@ +import pandas as pd +import numpy as np +import openai +from langchain.llms import OpenAI +from langchain.agents import create_pandas_dataframe_agent +from langchain.chat_models import ChatOpenAI +from langchain.agents.agent_types import AgentType +from prompts.error_correction_prompt import ErrorCorrectionPrompt +from config import Config + + +class AIDataFrame(pd.DataFrame): + def __init__(self, df, config=None, description=None, name=None) -> None: + super().__init__(df) + + #initialize pandas dataframe + self.pd_df = df + self.config = Config() + + if len(df)>0: + self.is_df_loaded = True + else: + self.is_df_loaded = False + + #set the description + self.description = description + + #set the config + if config: + self.config = config + + #set name + self.name = name + + #setup cache + self.cache = {} + + @property + def col_count(self): + if self.is_df_loaded: + return len(list(self.pd_df.columns)) + + @property + def row_count(self): + if self.is_df_loaded: + return len(self.pd_df) + + @property + def sample_head_csv(self): + if self.is_df_loaded: + return self.pd_df.head(5).to_csv() + + + @property + def metadata(self): + return self.pd_df.info() + + def to_csv(self, file_path): + self.pd_df.to_csv(file_path) + + + def clear_cache(self): + 
self.cache = {} + + + def initialize_middleware(self): + open_ai_key = self.config.get_open_ai_key() + + self.llm_agent = create_pandas_dataframe_agent(OpenAI(temperature=0, openai_api_key=open_ai_key), \ + self.pd_df, verbose=False) + openai.api_key = open_ai_key + self.openai_model = "text-davinci-003" + return + + def query_dataframe(self, query): + if query not in self.cache: + ans = self.llm_agent.run(query) + self.cache[query] = ans + else: + ans= self.cache[query] + return ans + + def code_error_correction(self, query, error, old_python_code): + prompt = ErrorCorrectionPrompt().get_prompt(self.pd_df, query, error, old_python_code) + #print(prompt) + response = openai.Completion.create(engine = self.openai_model, prompt = prompt) + answer = response.choices[0].text + + return answer + + def chat(self, prompt): + ans = self.llm_agent.run(prompt) + return ans + + diff --git a/functions/README.md b/functions/README.md new file mode 100644 index 0000000..e69de29 diff --git a/prompts/error_correction_prompt.py b/prompts/error_correction_prompt.py new file mode 100644 index 0000000..768ddf5 --- /dev/null +++ b/prompts/error_correction_prompt.py @@ -0,0 +1,17 @@ +class ErrorCorrectionPrompt: + def __init__(self) -> None: + pass + + def get_prompt(self, df, *argv): + user_question = argv[0] + error = argv[1] + python_code = argv[2] + + text = f""" + Given the following pandas dataframe {df}, and the following question was asked {user_question}. + The following python code was generated : {python_code}. + This code gave the following error: {error}. + Correct the python code and return a new python code (do not import anything) that fixes the above mentioned error. + Do not generate the same code again. + """ + return text \ No newline at end of file diff --git a/utils/chunk.py b/utils/chunk.py deleted file mode 100644 index 5541fb6..0000000 --- a/utils/chunk.py +++ /dev/null @@ -1,70 +0,0 @@ -import pandas as pd -from evadb.catalog.catalog_type import ColumnType -from evadb.functions.abstract.abstract_function import AbstractFunction -from evadb.functions.decorators.decorators import forward, setup -from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe - -class Chunk(AbstractFunction): - """ - Arguments: - None - - Input Signatures: - input_dataframe (DataFrame) : A DataFrame containing a column of strings. - - Output Signatures: - output_dataframe (DataFrame) : A DataFrame containing chunks of strings. - - Example Usage: - You can use this function to concatenate strings in a DataFrame and split them into chunks. 
- """ - - @property - def name(self) -> str: - return "Chunk" - - @setup(cacheable=False) - def setup(self) -> None: - # Any setup or initialization can be done here if needed - pass - - @forward( - input_signatures=[ - PandasDataframe( - columns=["text"], - column_types=[ColumnType.TEXT], - column_shapes=[(None,)], - ) - ], - output_signatures=[ - PandasDataframe( - columns=["chunks"], - column_types=[ColumnType.TEXT], - column_shapes=[(None,)], - ) - ], - ) - def forward(self, input_dataframe): - # Ensure input is provided - if input_dataframe.empty: - raise ValueError("Input DataFrame must not be empty.") - - # Define the maximum number of tokens per chunk - max_tokens_per_chunk = 100 # Adjust this value as needed - - # Initialize lists for the output DataFrame - output_strings = [] - - # Iterate over rows of the input DataFrame - for _, row in input_dataframe.iterrows(): - input_string = row["text"] - - # Split the input string into chunks of maximum tokens - chunks = [input_string[i:i + max_tokens_per_chunk] for i in range(0, len(input_string), max_tokens_per_chunk)] - - output_strings.extend(chunks) - - # Create a DataFrame with the output strings - output_dataframe = pd.DataFrame({"chunks": output_strings}) - - return output_dataframe \ No newline at end of file diff --git a/utils/script.py b/utils/script.py deleted file mode 100644 index 604c562..0000000 --- a/utils/script.py +++ /dev/null @@ -1,185 +0,0 @@ -import json -import csv -import os -import evadb -import pandas as pd - -# Specify the directory containing your JSON files and the desired CSV file name -JSON_DIRECTORY = "./atlanta" -PROJECT_NAME = "postgres" - -CSV_FILE_PATH = f'{PROJECT_NAME}.csv' - -# Initialize an empty list to store the combined data from all JSON files -combined_data = [] - -# Iterate through each JSON file in the directory -for filename in os.listdir(JSON_DIRECTORY): - if filename.endswith('.json'): - json_file_path = os.path.join(JSON_DIRECTORY, filename) - - # Open the JSON file for reading - with open(json_file_path, 'r', encoding='utf-8') as json_input_file: - # Load the JSON data from the file - json_data = json.load(json_input_file) - for json_obj in json_data: - json_obj['date'] =\ - os.path.basename(str(json_file_path)) - - # Append the JSON data to the combined_data list - combined_data.extend(json_data) - -# Specify the headers for your CSV file based on the keys present in the JSON data -# This will ensure that only common keys across all JSON objects are included -csv_headers = list(set().union(*(d.keys() for d in combined_data))) - -# Open the CSV file for writing -with open(CSV_FILE_PATH, 'w', newline='', encoding='utf-8') as csv_output_file: - # Create a CSV writer - csv_writer = csv.DictWriter(csv_output_file, fieldnames=csv_headers) - - # Write the headers to the CSV file - csv_writer.writeheader() - - # Write the combined JSON data to the CSV file - csv_writer.writerows(combined_data) - -print(f'Conversion from JSON to CSV complete. 
Data saved to {CSV_FILE_PATH}') - -# Specify the input CSV file and output CSV file -input_csv_file = CSV_FILE_PATH -output_csv_file = CSV_FILE_PATH - -# Define the old and new column names -old_column_name = 'metadata' -new_column_name = 'metadata_slack' - -# Read the input CSV file and create a list of rows -with open(input_csv_file, 'r', newline='', encoding='utf-8') as input_file: - # Create a CSV reader - csv_reader = csv.reader(input_file) - - # Read the header row - header = next(csv_reader) - - # Find the index of the old column name in the header - try: - old_index = header.index(old_column_name) - except ValueError: - # Handle the case where the old column name is not found in the header - print(f'Column name "{old_column_name}" not found in the header.') - exit(1) - - # Update the header with the new column name - header[old_index] = new_column_name - - # Read the rest of the rows - rows = list(csv_reader) - -# Write the modified CSV data to the output file -with open(output_csv_file, 'w', newline='', encoding='utf-8') as output_file: - # Create a CSV writer - csv_writer = csv.writer(output_file) - - # Write the updated header - csv_writer.writerow(header) - - # Write the rest of the rows - csv_writer.writerows(rows) - -print(f'Column name "{old_column_name}" has been changed to "{new_column_name}" in {output_csv_file}') - -if __name__ == "__main__": - try: - # establish evadb api cursor - print("⏳ Establishing evadb connection...") - cursor = evadb.connect().cursor() - print("✅ evadb connection setup complete!") - - print(f'{CSV_FILE_PATH}') - - cursor.query(f"DROP FUNCTION IF EXISTS Chunk;").df() - - cursor.query(f""" - CREATE FUNCTION Chunk - INPUT (text TEXT(1000)) - OUTPUT (chunks TEXT(1000)) - TYPE StringProcessing - IMPL 'chunk.py'; - """).df() - - cursor.query(f"DROP FUNCTION IF EXISTS Contains;").df() - - cursor.query(f""" - CREATE FUNCTION Contains - INPUT (input_string TEXT(1000), substring TEXT(1000)) - OUTPUT (contains BOOLEAN) - TYPE StringProcessing - IMPL 'contains.py'; - """).df() - - cursor.query(f"DROP TABLE IF EXISTS SlackCSV;").df() - - cursor.query(f"""CREATE TABLE SlackCSV( - blocks TEXT(1000), - user_profile TEXT(1000), - reply_count TEXT(1000), - edited TEXT(1000), - user TEXT(1000), - username TEXT(1000), - bot_id INTEGER, - text TEXT(1000), - user_team TEXT(1000), - replies TEXT(1000), - icons TEXT(1000), - hidden TEXT(1000), - delete_original TEXT(1000), - pinned_to TEXT(1000), - latest_reply TEXT(1000), - old_name TEXT(1000), - team TEXT(1000), - reply_users TEXT(1000), - metadata_slack TEXT(1000), - replace_original TEXT(1000), - subscribed TEXT(1000), - reply_users_count TEXT(1000), - parent_user_id TEXT(1000), - thread_ts TEXT(1000), - attachments TEXT(1000), - subtype TEXT(1000), - last_read TEXT(1000), - client_msg_id TEXT(1000), - bot_profile TEXT(1000), - reactions TEXT(1000), - files TEXT(1000), - name TEXT(1000), - inviter TEXT(1000), - upload TEXT(1000), - type TEXT(1000), - ts TEXT(1000), - purpose TEXT(1000), - source_team TEXT(1000), - date TEXT(1000) - ); - """).df() - - cursor.query(f"LOAD CSV '{CSV_FILE_PATH}' INTO SlackCSV;").df() - - pd.set_option('display.max_columns', None) # Show all columns - pd.set_option('display.expand_frame_repr', False) - pd.set_option('display.max_colwidth', None) - print("here1") - # execute a select query - select_query = cursor.query( - """SELECT Chunk(text) - FROM SlackCSV - WHERE _row_id < 100 AND Contains(text, "predict") = "True"; - """).df() - print("here2") - print(select_query) - - except 
Exception as e: - print("❗️ Session ended with an error.") - print(e) - -exit(0) diff --git a/utils/search_term.py b/utils/search_term.py deleted file mode 100644 index b372237..0000000 --- a/utils/search_term.py +++ /dev/null @@ -1,114 +0,0 @@ -import os -import subprocess -import sys -import openai -import evadb - -# Replace 'your-api-key' with your OpenAI API key -openai.api_key = "sk-xx" - -MAX_CHUNK_SIZE=15000 - -# Check if the search term argument is provided -if len(sys.argv) != 2: - print("Usage: python script.py ") - sys.exit(1) - - -# Extract the search term from the command line arguments -search_term = sys.argv[1] - -# Define the directory where you want to search for JSON files -search_directory = "./" - -# Define the output file name -output_file = "data.txt" - -# Construct the find command to search for JSON files containing the specified term -find_command = f'find "{search_directory}" -name "*.json" -exec grep -Hn --color "{search_term}" {{}} \\; > "{output_file}"' - -# Execute the find command -os.system(find_command) - -print(f"Search results saved to {output_file}") - -# Function to split text into chunks of MAX_CHUNK_SIZE characters or less, stopping at the nearest newline -def split_text_into_chunks(text, max_chunk_size=MAX_CHUNK_SIZE): - chunks = [] - current_chunk = "" - - for line in text.splitlines(): - if len(current_chunk) + len(line) + 1 <= max_chunk_size: - # Add line to the current chunk - if current_chunk: - current_chunk += '\n' - current_chunk += line - else: - # Start a new chunk - chunks.append(current_chunk) - current_chunk = line - - if current_chunk: - chunks.append(current_chunk) - - return chunks - -# Read the contents of the "data.txt" file -with open(output_file, 'r') as file: - file_contents = file.read() - -# Split the contents into chunks -text_chunks = split_text_into_chunks(file_contents) - -# Save the text chunks to a new file -chunked_output_file = "data_chunks.txt" -i = 0 -with open(chunked_output_file, 'w') as chunked_file: - for chunk in text_chunks: - i = i + 1 - chunk = chunk + '\n\n\n\n\n\n\n' - chunked_file.write(chunk) - print("chunk " + str(i)) - -print(f"Text chunks saved to {chunked_output_file}") - -# Initialize an empty list to store responses -responses = [] - -# Create a function to generate responses using the chat model -def generate_chat_response(prompt): - try: - print("done") - return "tmp" - response = openai.ChatCompletion.create( - model="gpt-3.5-turbo-16k", # Use the appropriate chat model - messages=[ - {"role": "system", "content": "You are a junior programmer trying to summarize user experience issues."}, - {"role": "user", "content": prompt}, - ], - stop=None, # You can specify a stop condition if necessary - temperature=0.7, # You can adjust the temperature for creativity - ) - return response.choices[0].message['content'] - except Exception as e: - return str(e) - -# Question to add to the prompt -question = """Summarize the user complaints in these JSON messages. Along with each complaint, provide the relevant user messages and file names (e.g., questions/2022-06-03.json). 
--- """ - - -# Iterate through each chunk and query ChatGPT with the question -for chunk in text_chunks: - prompt = f"{question}\n{chunk}" # Add the question to the chunk - response = generate_chat_response(prompt) - print(response) - responses.append(response) - -# Save the responses to a new file -responses_output_file = "responses.txt" -with open(responses_output_file, 'w') as responses_file: - for response in responses: - responses_file.write(response + '\n') - -print(f"Responses saved to {responses_output_file}") - From 404379c8f4e33419dc1597bfaa53843945b07a53 Mon Sep 17 00:00:00 2001 From: Ishwarya Sivakumar Date: Thu, 28 Sep 2023 01:24:45 -0400 Subject: [PATCH 06/18] the semantic_cache udf --- functions/semantic_cache.py | 54 +++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 functions/semantic_cache.py diff --git a/functions/semantic_cache.py b/functions/semantic_cache.py new file mode 100644 index 0000000..37a764f --- /dev/null +++ b/functions/semantic_cache.py @@ -0,0 +1,54 @@ + +import pandas as pd + +from evadb.catalog.catalog_type import NdArrayType +from evadb.functions.abstract.abstract_function import AbstractFunction +from evadb.functions.decorators.decorators import forward, setup +from evadb.functions.decorators.io_descriptors.data_types import PandasDataframe +from evadb.functions.gpu_compatible import GPUCompatible + +from datastructure.aidDataframe import AIDataFrame + +class ChatWithPandas(AbstractFunction): + + + @setup(cacheable=False, function_type="FeatureExtraction", batchable=False) + def setup(self): + pass + + @property + def name(self) -> str: + return "SentenceTransformerFeatureExtractor" + + @forward( + input_signatures=[ + PandasDataframe( + columns=["data"], + column_types=[NdArrayType.STR], + column_shapes=[(1)], + ), + + ], + output_signatures=[ + PandasDataframe( + columns=["response"], + column_types=[NdArrayType.FLOAT32], + column_shapes=[(1, 384)], + ) + ], + ) + def forward(self, df: pd.DataFrame) -> pd.DataFrame: + + query = df[0][0] + req_df = df.drop([0], axis=1) + + smart_df = AIDataFrame(req_df, description="A dataframe about cars") + smart_df.initialize_middleware() + + response = smart_df.chat(query) + + df_dict = {"response": [response]} + + ans_df = pd.DataFrame(df_dict) + return pd.DataFrame(ans_df) + From bd2baae1b46e3ea9e0c9964789d9502c6060f271 Mon Sep 17 00:00:00 2001 From: Preethi1609 Date: Tue, 3 Oct 2023 18:08:20 -0400 Subject: [PATCH 07/18] add support for local llm models --- config.py | 3 +++ datastructure/aidDataframe.py | 10 +++++++++- functions/semantic_cache.py | 8 +++++--- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/config.py b/config.py index 8905a18..6f4e6ec 100644 --- a/config.py +++ b/config.py @@ -1,7 +1,10 @@ class Config: def __init__(self) -> None: self.open_ai_key = "" + self.local_llm_model = "llama-2-7b-chat.ggmlv3.q4_0.bin" def get_open_ai_key(self): return self.open_ai_key + def get_local_llm_model(self): + return self.local_llm_model \ No newline at end of file diff --git a/datastructure/aidDataframe.py b/datastructure/aidDataframe.py index c9093e0..4b0652b 100644 --- a/datastructure/aidDataframe.py +++ b/datastructure/aidDataframe.py @@ -1,6 +1,7 @@ import pandas as pd import numpy as np import openai +from gpt4all import GPT4All from langchain.llms import OpenAI from langchain.agents import create_pandas_dataframe_agent from langchain.chat_models import ChatOpenAI @@ -72,6 +73,11 @@ def initialize_middleware(self): self.openai_model = 
"text-davinci-003" return + def initialize_local_llm_model(self): + local_llm_model = self.config.get_local_llm_model() + self.local_llm = GPT4All(local_llm_model) + return + def query_dataframe(self, query): if query not in self.cache: ans = self.llm_agent.run(query) @@ -88,7 +94,9 @@ def code_error_correction(self, query, error, old_python_code): return answer - def chat(self, prompt): + def chat(self, prompt, local=False): + if local: + return self.local_llm.generate(prompt) ans = self.llm_agent.run(prompt) return ans diff --git a/functions/semantic_cache.py b/functions/semantic_cache.py index 37a764f..e3664df 100644 --- a/functions/semantic_cache.py +++ b/functions/semantic_cache.py @@ -40,12 +40,14 @@ def name(self) -> str: def forward(self, df: pd.DataFrame) -> pd.DataFrame: query = df[0][0] + print("query is: QQQQ", query) req_df = df.drop([0], axis=1) smart_df = AIDataFrame(req_df, description="A dataframe about cars") - smart_df.initialize_middleware() - - response = smart_df.chat(query) + # smart_df.initialize_middleware() + smart_df.initialize_local_llm_model() + # response = smart_df.chat(query) + response = smart_df.chat(query, local=True) df_dict = {"response": [response]} From 62644e14c76f8876e8d78ad29ce7f31d9dffe2b7 Mon Sep 17 00:00:00 2001 From: Preethi1609 Date: Wed, 4 Oct 2023 16:42:37 -0400 Subject: [PATCH 08/18] change prompt, add vars to eva udf, test local llm --- run_test.sh | 2 + test/test_chat_with_pandas_local_llm.py | 61 +++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 run_test.sh create mode 100644 test/test_chat_with_pandas_local_llm.py diff --git a/run_test.sh b/run_test.sh new file mode 100644 index 0000000..5900f18 --- /dev/null +++ b/run_test.sh @@ -0,0 +1,2 @@ +export PYTHONPATH=$PWD +python3 -m unittest discover test/ diff --git a/test/test_chat_with_pandas_local_llm.py b/test/test_chat_with_pandas_local_llm.py new file mode 100644 index 0000000..d90e42d --- /dev/null +++ b/test/test_chat_with_pandas_local_llm.py @@ -0,0 +1,61 @@ +import unittest +import os +import pandas as pd +import evadb + +class TestEvaDBFunctions(unittest.TestCase): + + def setUp(self): + self.conn = evadb.connect() + self.cursor = self.conn.cursor() + print("Connected to EvaDB") + + create_function_query = f"""CREATE FUNCTION IF NOT EXISTS ChatWithPandas + IMPL './functions/semantic_cache.py' use_local_llm 'True' local_llm_model "llama-2-7b-chat.ggmlv3.q4_0.bin"; + """ + self.cursor.query("DROP FUNCTION IF EXISTS ChatWithPandas;").execute() + self.cursor.query(create_function_query).execute() + print("Created Function") + + create_table_query = """ + CREATE TABLE IF NOT EXISTS CARSDATA( + id INTEGER, + name TEXT(30), + mpg INTEGER, + cyl FLOAT(64,64), + disp FLOAT(64,64), + hp FLOAT(64,64), + drat FLOAT(64,64), + wt FLOAT(64,64), + qsec FLOAT(64,64), + vs FLOAT(64,64), + am FLOAT(64,64), + gear FLOAT(64,64), + carb FLOAT(64,64) + ); + """ + load_data_query = """ LOAD CSV 'data/cars.csv' INTO CARSDATA; + """ + + self.cursor.query(create_table_query).execute() + self.cursor.query(load_data_query).execute() + print("Loaded data") + + def test_mean_of_gear_column(self): + chat_query = "SELECT ChatWithPandas('what is the mean of the gear column', gear, name) FROM CARSDATA;" + result = self.cursor.query(chat_query).execute() + print("RESULTT-", result) + self.assertIsNotNone(result) + + def test_highest_gear_value_car(self): + chat_query = "SELECT ChatWithPandas('which car has the highest gear value', gear, name) FROM CARSDATA;" + result = 
self.cursor.query(chat_query).execute() + print("RESULTTT2: ", result) + self.assertIsNotNone(result) + + def tearDown(self): + self.cursor.close() + print("Closed EvaDB connection") + +if __name__ == '__main__': + unittest.main() From 149a785d21182b35732a826416b5aa7fab78c4bf Mon Sep 17 00:00:00 2001 From: Preethi1609 Date: Fri, 6 Oct 2023 06:49:04 -0400 Subject: [PATCH 09/18] modify query, add test --- datastructure/aidDataframe.py | 13 ++++-- functions/semantic_cache.py | 11 +++-- test/a_test_chat_with_pandas_open_ai.py | 58 +++++++++++++++++++++++++ 3 files changed, 76 insertions(+), 6 deletions(-) create mode 100644 test/a_test_chat_with_pandas_open_ai.py diff --git a/datastructure/aidDataframe.py b/datastructure/aidDataframe.py index 4b0652b..c890619 100644 --- a/datastructure/aidDataframe.py +++ b/datastructure/aidDataframe.py @@ -73,8 +73,11 @@ def initialize_middleware(self): self.openai_model = "text-davinci-003" return - def initialize_local_llm_model(self): - local_llm_model = self.config.get_local_llm_model() + def initialize_local_llm_model(self, local_llm=None): + if local_llm: + local_llm_model = local_llm + else: + local_llm_model = self.config.get_local_llm_model(local_llm_model) self.local_llm = GPT4All(local_llm_model) return @@ -96,7 +99,11 @@ def code_error_correction(self, query, error, old_python_code): def chat(self, prompt, local=False): if local: - return self.local_llm.generate(prompt) + query = f"""There is a dataframe in pandas (python). The name of the + dataframe is pd_df. This is the result of print(self.pd_df.head()): + {str(self.pd_df.head())}\nAnswer the question: + Question : {prompt}. """ + return self.local_llm.generate(query) ans = self.llm_agent.run(prompt) return ans diff --git a/functions/semantic_cache.py b/functions/semantic_cache.py index e3664df..c6ab064 100644 --- a/functions/semantic_cache.py +++ b/functions/semantic_cache.py @@ -13,7 +13,9 @@ class ChatWithPandas(AbstractFunction): @setup(cacheable=False, function_type="FeatureExtraction", batchable=False) - def setup(self): + def setup(self, use_local_llm=False, local_llm_model=None): + self.use_local_llm = use_local_llm + self.local_llm_model = local_llm_model pass @property @@ -44,8 +46,11 @@ def forward(self, df: pd.DataFrame) -> pd.DataFrame: req_df = df.drop([0], axis=1) smart_df = AIDataFrame(req_df, description="A dataframe about cars") - # smart_df.initialize_middleware() - smart_df.initialize_local_llm_model() + if self.use_local_llm: + smart_df.initialize_local_llm_model(local_llm=self.local_llm_model) + else: + smart_df.initialize_middleware() + # response = smart_df.chat(query) response = smart_df.chat(query, local=True) diff --git a/test/a_test_chat_with_pandas_open_ai.py b/test/a_test_chat_with_pandas_open_ai.py new file mode 100644 index 0000000..aa7ff2a --- /dev/null +++ b/test/a_test_chat_with_pandas_open_ai.py @@ -0,0 +1,58 @@ +import unittest +import os +import pandas as pd +import evadb + +class TestEvaDBFunctions(unittest.TestCase): + + def setUp(self): + self.conn = evadb.connect() + self.cursor = self.conn.cursor() + print("Connected to EvaDB") + + create_function_query = f"""CREATE FUNCTION IF NOT EXISTS ChatWithPandas + IMPL './functions/semantic_cache.py'""" + self.cursor.query("DROP FUNCTION IF EXISTS ChatWithPandas;").execute() + self.cursor.query(create_function_query).execute() + print("Created Function") + + create_table_query = """ + CREATE TABLE IF NOT EXISTS CARSDATA( + id INTEGER, + name TEXT(30), + mpg INTEGER, + cyl FLOAT(64,64), + disp FLOAT(64,64), + 
hp FLOAT(64,64), + drat FLOAT(64,64), + wt FLOAT(64,64), + qsec FLOAT(64,64), + vs FLOAT(64,64), + am FLOAT(64,64), + gear FLOAT(64,64), + carb FLOAT(64,64) + ); + """ + load_data_query = """ LOAD CSV 'data/cars.csv' INTO CARSDATA; + """ + + self.cursor.query(create_table_query).execute() + self.cursor.query(load_data_query).execute() + print("Loaded data") + + def test_mean_of_gear_column(self): + chat_query = "SELECT ChatWithPandas('what is the mean of the gear column', gear, name) FROM CARSDATA;" + result = self.cursor.query(chat_query).execute() + self.assertIsNotNone(result) + + def test_highest_gear_value_car(self): + chat_query = "SELECT ChatWithPandas('which car has the highest gear value', gear, name) FROM CARSDATA;" + result = self.cursor.query(chat_query).execute() + self.assertIsNotNone(result) + + def tearDown(self): + self.cursor.close() + print("Closed EvaDB connection") + +if __name__ == '__main__': + unittest.main() From e2e34c1c6ebc56e744d8f4a3c2f9acc389d73b9e Mon Sep 17 00:00:00 2001 From: Preethi1609 Date: Fri, 6 Oct 2023 09:04:54 -0400 Subject: [PATCH 10/18] works but not clean --- datastructure/aidDataframe.py | 21 +++++++++++++++------ functions/semantic_cache.py | 12 +++++------- test/test_chat_with_pandas_local_llm.py | 7 ++----- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/datastructure/aidDataframe.py b/datastructure/aidDataframe.py index c890619..f09d811 100644 --- a/datastructure/aidDataframe.py +++ b/datastructure/aidDataframe.py @@ -1,6 +1,7 @@ import pandas as pd import numpy as np import openai +import subprocess from gpt4all import GPT4All from langchain.llms import OpenAI from langchain.agents import create_pandas_dataframe_agent @@ -16,6 +17,7 @@ def __init__(self, df, config=None, description=None, name=None) -> None: #initialize pandas dataframe self.pd_df = df + print("pd_df INITTT: \n", str(self.pd_df)) self.config = Config() if len(df)>0: @@ -31,7 +33,8 @@ def __init__(self, df, config=None, description=None, name=None) -> None: self.config = config #set name - self.name = name + if name: + self.name = name #setup cache self.cache = {} @@ -99,12 +102,18 @@ def code_error_correction(self, query, error, old_python_code): def chat(self, prompt, local=False): if local: + print("df HEREEEE\n", str(self.pd_df)) + # query = f"""There is a dataframe in pandas (python). This is the result of print(self.pd_df.head()): + # {str(self.pd_df.head())}\nAnswer the question: {prompt}. """ query = f"""There is a dataframe in pandas (python). The name of the - dataframe is pd_df. This is the result of print(self.pd_df.head()): - {str(self.pd_df.head())}\nAnswer the question: - Question : {prompt}. """ + dataframe is self.pd_df. This is the result of print(self.pd_df):\n + {str(self.pd_df.head())}. Return a python script with comments to get the answer to the following question: {prompt}. Do not write code to load the CSV file.""" + print("QUERYYY QQQQ", query) + # query = f"""There is a dataframe in pandas (python). The name of the + # dataframe is self.pd_df. This is the result of print(self.pd_df.head()): + # {str(self.pd_df.head())}. Return a python script without any other text to the following question: {prompt}. 
Do not write code to load the CSV file.""" return self.local_llm.generate(query) - ans = self.llm_agent.run(prompt) - return ans + else: + return self.llm_agent.run(prompt) diff --git a/functions/semantic_cache.py b/functions/semantic_cache.py index c6ab064..7183d5b 100644 --- a/functions/semantic_cache.py +++ b/functions/semantic_cache.py @@ -39,23 +39,21 @@ def name(self) -> str: ) ], ) - def forward(self, df: pd.DataFrame) -> pd.DataFrame: - + def forward(self, df: pd.DataFrame): + # print("full df: \n", df) query = df[0][0] print("query is: QQQQ", query) req_df = df.drop([0], axis=1) - + # print("req_df: \n", req_df) smart_df = AIDataFrame(req_df, description="A dataframe about cars") if self.use_local_llm: smart_df.initialize_local_llm_model(local_llm=self.local_llm_model) else: smart_df.initialize_middleware() - - # response = smart_df.chat(query) response = smart_df.chat(query, local=True) - + print("ANSWERRR:", response) df_dict = {"response": [response]} ans_df = pd.DataFrame(df_dict) return pd.DataFrame(ans_df) - + diff --git a/test/test_chat_with_pandas_local_llm.py b/test/test_chat_with_pandas_local_llm.py index d90e42d..63bea99 100644 --- a/test/test_chat_with_pandas_local_llm.py +++ b/test/test_chat_with_pandas_local_llm.py @@ -9,7 +9,6 @@ def setUp(self): self.conn = evadb.connect() self.cursor = self.conn.cursor() print("Connected to EvaDB") - create_function_query = f"""CREATE FUNCTION IF NOT EXISTS ChatWithPandas IMPL './functions/semantic_cache.py' use_local_llm 'True' local_llm_model "llama-2-7b-chat.ggmlv3.q4_0.bin"; """ @@ -44,14 +43,12 @@ def setUp(self): def test_mean_of_gear_column(self): chat_query = "SELECT ChatWithPandas('what is the mean of the gear column', gear, name) FROM CARSDATA;" result = self.cursor.query(chat_query).execute() - print("RESULTT-", result) - self.assertIsNotNone(result) + print("RESULTT", result) def test_highest_gear_value_car(self): chat_query = "SELECT ChatWithPandas('which car has the highest gear value', gear, name) FROM CARSDATA;" result = self.cursor.query(chat_query).execute() - print("RESULTTT2: ", result) - self.assertIsNotNone(result) + print("RESULTT", result) def tearDown(self): self.cursor.close() From f1b5c8b249af3c004abfad0d893c57551549386b Mon Sep 17 00:00:00 2001 From: Preethi1609 Date: Fri, 6 Oct 2023 10:09:10 -0400 Subject: [PATCH 11/18] clean, return python script --- chat_runner_local.py | 51 +++++++++++++++++++++++++++++++++++ datastructure/aidDataframe.py | 19 +++++++------ functions/semantic_cache.py | 16 +++++++---- 3 files changed, 73 insertions(+), 13 deletions(-) create mode 100644 chat_runner_local.py diff --git a/chat_runner_local.py b/chat_runner_local.py new file mode 100644 index 0000000..73cdf8d --- /dev/null +++ b/chat_runner_local.py @@ -0,0 +1,51 @@ +import os +import pandas as pd +import evadb + +cursor = evadb.connect().cursor() +print("Connected to EvaDB") + +# create_function_query = f"""CREATE FUNCTION IF NOT EXISTS ChatWithPandas +# IMPL './functions/semantic_cache.py'; +# """ +create_function_query = f"""CREATE FUNCTION IF NOT EXISTS ChatWithPandas + IMPL './functions/semantic_cache.py' use_local_llm 'True' local_llm_model "llama-2-7b-chat.ggmlv3.q4_0.bin"; + """ +cursor.query("DROP FUNCTION IF EXISTS ChatWithPandas;").execute() +cursor.query(create_function_query).execute() +print("Created Function") + +create_table_query = f""" +CREATE TABLE IF NOT EXISTS CARSDATA( +id INTEGER, +name TEXT(30), +mpg INTEGER, +cyl FLOAT(64,64), +disp FLOAT(64,64), +hp FLOAT(64,64), +drat FLOAT(64,64), +wt 
FLOAT(64,64), +qsec FLOAT(64,64), +vs FLOAT(64,64), +am FLOAT(64,64), +gear FLOAT(64,64), +carb FLOAT(64,64) +); +""" +load_data_query = f""" LOAD CSV 'data/cars.csv' INTO CARSDATA; +""" + +cursor.query(create_table_query).execute() +cursor.query(load_data_query).execute() +print("loaded data") + +chat_query1 = f""" SELECT ChatWithPandas('what is the mean of the gear column',gear, name) FROM CARSDATA; +""" + +result1 = cursor.query(chat_query1).execute() +print(result1) + +chat_query2 = f""" SELECT ChatWithPandas('which car has the highest gear value',gear, name) FROM CARSDATA; +""" +result2 = cursor.query(chat_query2).execute() +print(result2) \ No newline at end of file diff --git a/datastructure/aidDataframe.py b/datastructure/aidDataframe.py index f09d811..d7ef192 100644 --- a/datastructure/aidDataframe.py +++ b/datastructure/aidDataframe.py @@ -102,17 +102,20 @@ def code_error_correction(self, query, error, old_python_code): def chat(self, prompt, local=False): if local: - print("df HEREEEE\n", str(self.pd_df)) - # query = f"""There is a dataframe in pandas (python). This is the result of print(self.pd_df.head()): - # {str(self.pd_df.head())}\nAnswer the question: {prompt}. """ - query = f"""There is a dataframe in pandas (python). The name of the - dataframe is self.pd_df. This is the result of print(self.pd_df):\n - {str(self.pd_df.head())}. Return a python script with comments to get the answer to the following question: {prompt}. Do not write code to load the CSV file.""" - print("QUERYYY QQQQ", query) + # print("QUERYYY QQQQ", query) + # query = f"""There is a dataframe in pandas (python). The name of the + # dataframe is self.pd_df. This is the result of print(self.pd_df):\n + # {str(self.pd_df.head())}. Return a python script without any other text to get the answer to the following question: {prompt}. Do not write code to load the CSV file.""" + # print("QUERYYY QQQQ", query) + # query = f"""There is a dataframe in pandas (python). The name of the + # dataframe is self.pd_df. This is the result of print(self.pd_df):\n + # {str(self.pd_df.head())}. Please provide a Python script without any introductory text answer the question: {prompt}. Do not write code to load the CSV file.""" + # query = f"""There is a dataframe in pandas (python). The name of the # dataframe is self.pd_df. This is the result of print(self.pd_df.head()): # {str(self.pd_df.head())}. Return a python script without any other text to the following question: {prompt}. Do not write code to load the CSV file.""" - return self.local_llm.generate(query) + return self.local_llm.generate(prompt) + else: return self.llm_agent.run(prompt) diff --git a/functions/semantic_cache.py b/functions/semantic_cache.py index 7183d5b..fb16b02 100644 --- a/functions/semantic_cache.py +++ b/functions/semantic_cache.py @@ -40,18 +40,24 @@ def name(self) -> str: ], ) def forward(self, df: pd.DataFrame): - # print("full df: \n", df) query = df[0][0] - print("query is: QQQQ", query) req_df = df.drop([0], axis=1) - # print("req_df: \n", req_df) smart_df = AIDataFrame(req_df, description="A dataframe about cars") if self.use_local_llm: smart_df.initialize_local_llm_model(local_llm=self.local_llm_model) + prompt = f"""There is a dataframe in pandas (python). This is the result of print(req_df.head()):\n + {str(req_df.head())}. 
Answer to the following question: {query}.""" + print("PROMPTT", prompt) + response = smart_df.chat(prompt, local=self.use_local_llm) + script = response.split("```")[1] + # script = response + load_df = f"import pandas as pd\ndf = pd.read_csv('/home/preethi/projects/pandas-ai-integration/data/cars.csv')\n" + print(load_df + "\n" + script) + ans = load_df + "\n" + script + print("ANSWERRR/n", ans) else: smart_df.initialize_middleware() - response = smart_df.chat(query, local=True) - print("ANSWERRR:", response) + response = smart_df.chat(query, local=self.use_local_llm) df_dict = {"response": [response]} ans_df = pd.DataFrame(df_dict) From d1d5b19e99c197b97541290281d558f5fb9c2037 Mon Sep 17 00:00:00 2001 From: Preethi1609 Date: Fri, 6 Oct 2023 10:36:41 -0400 Subject: [PATCH 12/18] modify prompt? --- chat_runner_local.py | 2 +- functions/semantic_cache.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/chat_runner_local.py b/chat_runner_local.py index 73cdf8d..dda25bc 100644 --- a/chat_runner_local.py +++ b/chat_runner_local.py @@ -9,7 +9,7 @@ # IMPL './functions/semantic_cache.py'; # """ create_function_query = f"""CREATE FUNCTION IF NOT EXISTS ChatWithPandas - IMPL './functions/semantic_cache.py' use_local_llm 'True' local_llm_model "llama-2-7b-chat.ggmlv3.q4_0.bin"; + IMPL './functions/semantic_cache.py' use_local_llm 'True' local_llm_model "llama-2-7b-chat.ggmlv3.q4_0.bin" csv_path "./data/cars.csv"; """ cursor.query("DROP FUNCTION IF EXISTS ChatWithPandas;").execute() cursor.query(create_function_query).execute() diff --git a/functions/semantic_cache.py b/functions/semantic_cache.py index fb16b02..6bbdb84 100644 --- a/functions/semantic_cache.py +++ b/functions/semantic_cache.py @@ -1,6 +1,6 @@ import pandas as pd - +import subprocess from evadb.catalog.catalog_type import NdArrayType from evadb.functions.abstract.abstract_function import AbstractFunction from evadb.functions.decorators.decorators import forward, setup @@ -13,9 +13,10 @@ class ChatWithPandas(AbstractFunction): @setup(cacheable=False, function_type="FeatureExtraction", batchable=False) - def setup(self, use_local_llm=False, local_llm_model=None): + def setup(self, use_local_llm=False, local_llm_model=None, csv_path=None): self.use_local_llm = use_local_llm self.local_llm_model = local_llm_model + self.csv_path = csv_path pass @property @@ -46,15 +47,14 @@ def forward(self, df: pd.DataFrame): if self.use_local_llm: smart_df.initialize_local_llm_model(local_llm=self.local_llm_model) prompt = f"""There is a dataframe in pandas (python). This is the result of print(req_df.head()):\n - {str(req_df.head())}. Answer to the following question: {query}.""" + {str(req_df.head())}. 
Return a python script to get the answer to the following question: {query}.""" print("PROMPTT", prompt) response = smart_df.chat(prompt, local=self.use_local_llm) - script = response.split("```")[1] + script = response.split("```")[1].lstrip("python") # script = response - load_df = f"import pandas as pd\ndf = pd.read_csv('/home/preethi/projects/pandas-ai-integration/data/cars.csv')\n" - print(load_df + "\n" + script) + load_df = f"import pandas as pd\nreq_df = pd.read_csv('{self.csv_path}')\n" ans = load_df + "\n" + script - print("ANSWERRR/n", ans) + print("ANSWERRR\n", ans) else: smart_df.initialize_middleware() response = smart_df.chat(query, local=self.use_local_llm) From c6f1c8d35cfbe09926e9e4f624ef22503c451e10 Mon Sep 17 00:00:00 2001 From: Preethi1609 Date: Wed, 25 Oct 2023 15:54:55 -0400 Subject: [PATCH 13/18] dummy csv --- data/Airbnb/missing_values/dummy.csv | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 data/Airbnb/missing_values/dummy.csv diff --git a/data/Airbnb/missing_values/dummy.csv b/data/Airbnb/missing_values/dummy.csv new file mode 100644 index 0000000..833ec7f --- /dev/null +++ b/data/Airbnb/missing_values/dummy.csv @@ -0,0 +1,2 @@ +Bathrooms,Bedrooms,Beds,LocationName,NumGuests,NumReviews,Price,Rating,latitude,longitude,zipcode,pop2016,pop2010,pop2000,cost_living_index (US avg. = 100),land_area (sq.mi.),water_area (sq.mi.),pop_density (people per mile),number of males,number of females,prop taxes paid 2016,median taxes (with mortgage,median taxes (no mortgage),median house value,median houshold income,median monthly owner costs (with mortgage),median monthly owner costs (no mortgage),median gross rent,median asking price for vacant for-sale home/condo,unemployment (%),Number of Homes,Count of Abnb,Density of Abnb (%),Average Abnb Price (by zipcode),Average NumReviews (by zipcode),Average Rating (by zipcode),Average Number of Bathrooms (by zipcode),Average Number of Bedrooms (by zipcode),Average Number of Beds (by zipcode),Average Number of Guests (by zipcode) +3.0,4.0,5.0,Atlanta,10.0,19.0,795.0,Y,33.76088,-84.36917,30308,17280.0,15413.0,11796,98.0,1.6,0.0,10836,10075,7205,1.2,3155.0,2380.0,259718.0,59088.0,1713.0,665.0,1162.0,326958.0,4.6,6912.0,210,3.038194444,141.4285714,36.27329193,4.880794702,1.285714286,1.494680851,1.933333333,1.933333333 From 5b6143aa6ebc14b972e6ae419e495fd1368d0e94 Mon Sep 17 00:00:00 2001 From: Preethi1609 Date: Wed, 25 Oct 2023 21:53:28 -0400 Subject: [PATCH 14/18] benchmark just drop NaN columns --- benchmark.py | 118 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 benchmark.py diff --git a/benchmark.py b/benchmark.py new file mode 100644 index 0000000..a1b860f --- /dev/null +++ b/benchmark.py @@ -0,0 +1,118 @@ +import os +import pandas as pd +import evadb +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import accuracy_score, f1_score + +cursor = evadb.connect().cursor() +print("Connected to EvaDB") +#local +# create_function_query = f"""CREATE FUNCTION IF NOT EXISTS ChatWithPandas +# IMPL './functions/chat_with_df.py' use_local_llm 'True' local_llm_model "llama-2-7b-chat.ggmlv3.q4_0.bin" csv_path "./data/cars.csv"; +# """ + +create_function_query = f"""CREATE FUNCTION IF NOT EXISTS ChatWithPandas + IMPL './functions/chat_with_df.py'; + """ + +cursor.query("DROP FUNCTION IF EXISTS ChatWithPandas;").execute() + 
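+# Drop any existing definition before re-creating: CREATE FUNCTION IF NOT
+# EXISTS would otherwise keep a stale version of the UDF between runs.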
+cursor.query(create_function_query).execute() +print("Created Function") + +create_table_query = f"""CREATE TABLE IF NOT EXISTS AIRBNB_DATA5( + Bathrooms FLOAT(64, 64), + Bedrooms FLOAT(64, 64), + Beds FLOAT(64, 64), + LocationName TEXT(255), + NumGuests FLOAT(64, 64), + NumReviews FLOAT(64, 64), + Price FLOAT(64, 64), + Rating TEXT(225), + latitude FLOAT(64, 64), + longitude FLOAT(64, 64), + zipcode TEXT(10), + pop2016 FLOAT(64, 64), + pop2010 FLOAT(64, 64), + pop2000 FLOAT(64, 64), + cost_living_index FLOAT(64, 64), + land_area FLOAT(64, 64), + water_area FLOAT(64, 64), + pop_density INTEGER, + number_of_males INTEGER, + number_of_females INTEGER, + prop_taxes_paid_2016 FLOAT(64, 64), + median_taxes_with_mortgage FLOAT(64, 64), + median_taxes_no_mortgage FLOAT(64, 64), + median_house_value FLOAT(64, 64), + median_household_income FLOAT(64, 64), + median_monthly_owner_costs_with_mortgage FLOAT(64, 64), + median_monthly_owner_costs_no_mortgage FLOAT(64, 64), + median_gross_rent FLOAT(64, 64), + median_asking_price_for_sale_home_condo FLOAT(64, 64), + unemployment FLOAT(64, 64), + number_of_homes INTEGER, + count_of_abnb INTEGER, + density_of_abnb FLOAT(64, 64), + avg_abnb_price_by_zipcode FLOAT(64, 64), + avg_num_reviews_by_zipcode FLOAT(64, 64), + avg_rating_by_zipcode FLOAT(64, 64), + avg_num_bathrooms_by_zipcode FLOAT(64, 64), + avg_num_bedrooms_by_zipcode FLOAT(64, 64), + avg_num_beds_by_zipcode FLOAT(64, 64), + avg_num_guests_by_zipcode FLOAT(64, 64) +); """ + +load_data_query = f""" LOAD CSV 'data/Airbnb/missing_values/dirty_test1.csv' INTO AIRBNB_DATA5;""" +cursor.query(create_table_query).df() +cursor.query(load_data_query).df() +print("loaded data") + + +data = pd.read_csv('data/Airbnb/missing_values/dirty_test1.csv') + +#clean using llm +# query = f""" SELECT ChatWithPandas('cleaning',\ +# 'impute null values with average of the column if an integer or float. 
replace with an empty string if column is a string.\ +# remove duplicate rows.', \ +# id, name, mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb) FROM AIRBNB_DATA5; +# """ +# cleaned_df = cursor.query(query).execute() +#clean ends here + + +data = data.dropna() +# Identify categorical columns +categorical_cols = data.select_dtypes(include=['object']).columns + +data = pd.get_dummies(data, columns=categorical_cols) +data.dropna() + +# Split features and labels +X = data.iloc[:, :-1].values +y = data.iloc[:, -1].values + +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) + +X_train = X_train.astype(float) +X_test = X_test.astype(float) +y_train = y_train.astype(float) +y_test = y_test.astype(float) + +scaler = StandardScaler() +X_train = scaler.fit_transform(X_train) +X_test = scaler.transform(X_test) + +model = LogisticRegression() +model.fit(X_train, y_train) + +y_pred = model.predict(X_test) + +accuracy = accuracy_score(y_test, y_pred) +f1 = f1_score(y_test, y_pred) + +print(f"Accuracy: {accuracy:.2f}") +print(f"F1 Score: {f1:.2f}") From 09fff6098c4df37084144e730e38d9621b7f0d15 Mon Sep 17 00:00:00 2001 From: Preethi1609 Date: Wed, 25 Oct 2023 22:39:03 -0400 Subject: [PATCH 15/18] add step to clean using llm --- benchmark.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/benchmark.py b/benchmark.py index a1b860f..38945c7 100644 --- a/benchmark.py +++ b/benchmark.py @@ -75,12 +75,13 @@ data = pd.read_csv('data/Airbnb/missing_values/dirty_test1.csv') #clean using llm -# query = f""" SELECT ChatWithPandas('cleaning',\ -# 'impute null values with average of the column if an integer or float. replace with an empty string if column is a string.\ -# remove duplicate rows.', \ -# id, name, mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb) FROM AIRBNB_DATA5; -# """ -# cleaned_df = cursor.query(query).execute() + +query = f""" SELECT ChatWithPandas('cleaning',\ + 'impute null values with average of the column if an integer or float. 
replace with an empty string if column is a string.\ + remove duplicate rows.', \ + Bathrooms, Bedrooms, Beds, LocationName, NumGuests, NumReviews, Price, Rating, latitude, longitude, zipcode, pop2016, pop2010, pop2000, cost_living_index, land_area, water_area, pop_density, number_of_males, number_of_females, prop_taxes_paid_2016, median_taxes_with_mortgage, median_taxes_no_mortgage, median_house_value, median_household_income, median_monthly_owner_costs_with_mortgage, median_monthly_owner_costs_no_mortgage, median_gross_rent, median_asking_price_for_sale_home_condo, unemployment, number_of_homes, count_of_abnb, density_of_abnb, avg_abnb_price_by_zipcode, avg_num_reviews_by_zipcode, avg_rating_by_zipcode, avg_num_bathrooms_by_zipcode, avg_num_bedrooms_by_zipcode, avg_num_beds_by_zipcode, avg_num_guests_by_zipcode) FROM AIRBNB_DATA5; +""" +data = cursor.query(query).execute() #clean ends here From 022623209c86f53f1f854ecb7fe9aef4d2eed5d6 Mon Sep 17 00:00:00 2001 From: Preethi1609 Date: Wed, 25 Oct 2023 22:53:19 -0400 Subject: [PATCH 16/18] comment out dropna --- benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark.py b/benchmark.py index 38945c7..756643c 100644 --- a/benchmark.py +++ b/benchmark.py @@ -85,7 +85,7 @@ #clean ends here -data = data.dropna() +# data = data.dropna() # Identify categorical columns categorical_cols = data.select_dtypes(include=['object']).columns From 16761e745a6f6420d67b7d4c9b3208640826c23d Mon Sep 17 00:00:00 2001 From: Preethi1609 Date: Fri, 27 Oct 2023 09:07:38 -0400 Subject: [PATCH 17/18] benchmark diff csvs --- benchmark.py | 108 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 78 insertions(+), 30 deletions(-) diff --git a/benchmark.py b/benchmark.py index 756643c..94decf5 100644 --- a/benchmark.py +++ b/benchmark.py @@ -2,6 +2,9 @@ import pandas as pd import evadb import pandas as pd +import torch +import torch.nn as nn +import torch.optim as optim from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LogisticRegression @@ -24,14 +27,14 @@ print("Created Function") create_table_query = f"""CREATE TABLE IF NOT EXISTS AIRBNB_DATA5( - Bathrooms FLOAT(64, 64), - Bedrooms FLOAT(64, 64), - Beds FLOAT(64, 64), - LocationName TEXT(255), - NumGuests FLOAT(64, 64), - NumReviews FLOAT(64, 64), - Price FLOAT(64, 64), - Rating TEXT(225), + bathrooms FLOAT(64, 64), + bedrooms FLOAT(64, 64), + beds FLOAT(64, 64), + location_name TEXT(255), + num_guests FLOAT(64, 64), + num_reviews FLOAT(64, 64), + price FLOAT(64, 64), + rating TEXT(225), latitude FLOAT(64, 64), longitude FLOAT(64, 64), zipcode TEXT(10), @@ -72,16 +75,18 @@ print("loaded data") -data = pd.read_csv('data/Airbnb/missing_values/dirty_test1.csv') +# data = pd.read_csv('data/Airbnb/missing_values/dirty_test1.csv') +data = pd.read_csv('cleaned_dfs/cleaned_df_int.csv') +# data = pd.read_csv('cleaned_df.csv') #clean using llm + # remove duplicate rows.', \ -query = f""" SELECT ChatWithPandas('cleaning',\ - 'impute null values with average of the column if an integer or float. 
replace with an empty string if column is a string.\ - remove duplicate rows.', \ - Bathrooms, Bedrooms, Beds, LocationName, NumGuests, NumReviews, Price, Rating, latitude, longitude, zipcode, pop2016, pop2010, pop2000, cost_living_index, land_area, water_area, pop_density, number_of_males, number_of_females, prop_taxes_paid_2016, median_taxes_with_mortgage, median_taxes_no_mortgage, median_house_value, median_household_income, median_monthly_owner_costs_with_mortgage, median_monthly_owner_costs_no_mortgage, median_gross_rent, median_asking_price_for_sale_home_condo, unemployment, number_of_homes, count_of_abnb, density_of_abnb, avg_abnb_price_by_zipcode, avg_num_reviews_by_zipcode, avg_rating_by_zipcode, avg_num_bathrooms_by_zipcode, avg_num_bedrooms_by_zipcode, avg_num_beds_by_zipcode, avg_num_guests_by_zipcode) FROM AIRBNB_DATA5; -""" -data = cursor.query(query).execute() +# query = f""" SELECT ChatWithPandas('cleaning',\ +# 'impute null values with average of the column if an integer or float. replace with an empty string if column is a string.',\ +# Bathrooms, Bedrooms, Beds, Location_Name, Num_Guests, Num_Reviews, Price, Rating, latitude, longitude, zipcode, pop2016, pop2010, pop2000, cost_living_index, land_area, water_area, pop_density, number_of_males, number_of_females, prop_taxes_paid_2016, median_taxes_with_mortgage, median_taxes_no_mortgage, median_house_value, median_household_income, median_monthly_owner_costs_with_mortgage, median_monthly_owner_costs_no_mortgage, median_gross_rent, median_asking_price_for_sale_home_condo, unemployment, number_of_homes, count_of_abnb, density_of_abnb, avg_abnb_price_by_zipcode, avg_num_reviews_by_zipcode, avg_rating_by_zipcode, avg_num_bathrooms_by_zipcode, avg_num_bedrooms_by_zipcode, avg_num_beds_by_zipcode, avg_num_guests_by_zipcode) FROM AIRBNB_DATA5; +# """ +# data = cursor.query(query).execute() #clean ends here @@ -90,7 +95,7 @@ categorical_cols = data.select_dtypes(include=['object']).columns data = pd.get_dummies(data, columns=categorical_cols) -data.dropna() +# data.dropna() # Split features and labels X = data.iloc[:, :-1].values @@ -103,17 +108,60 @@ y_train = y_train.astype(float) y_test = y_test.astype(float) -scaler = StandardScaler() -X_train = scaler.fit_transform(X_train) -X_test = scaler.transform(X_test) - -model = LogisticRegression() -model.fit(X_train, y_train) - -y_pred = model.predict(X_test) - -accuracy = accuracy_score(y_test, y_pred) -f1 = f1_score(y_test, y_pred) - -print(f"Accuracy: {accuracy:.2f}") -print(f"F1 Score: {f1:.2f}") +# Convert to torch tensors +X_train_tensor = torch.FloatTensor(X_train) +X_test_tensor = torch.FloatTensor(X_test) +y_train_tensor = torch.FloatTensor(y_train) +y_test_tensor = torch.FloatTensor(y_test) + +# Define a simple logistic regression model +class LogisticRegression(nn.Module): + def __init__(self, input_dim): + super(LogisticRegression, self).__init__() + self.linear = nn.Linear(input_dim, 1) + + def forward(self, x): + return torch.sigmoid(self.linear(x)) + +input_dim = X_train.shape[1] +model = LogisticRegression(input_dim) + +# Loss and optimizer +criterion = nn.CrossEntropyLoss() +optimizer = optim.SGD(model.parameters(), lr=0.01) + +# Train the model +epochs = 50 +for epoch in range(epochs): + model.train() + optimizer.zero_grad() + outputs = model(X_train_tensor).squeeze() + loss = criterion(outputs, y_train_tensor) + loss.backward() + optimizer.step() + +# Compute accuracy +model.eval() +with torch.no_grad(): + predictions = model(X_test_tensor).squeeze() + predictions 
= (predictions > 0.5).float()
+    correct = (predictions == y_test_tensor).float().sum()
+    accuracy = correct / len(y_test_tensor)
+    print("accuracy:", accuracy)
+
+
+
+# scaler = StandardScaler()
+# X_train = scaler.fit_transform(X_train)
+# X_test = scaler.transform(X_test)
+
+# model = LogisticRegression()
+# model.fit(X_train, y_train)
+
+# y_pred = model.predict(X_test)
+
+# accuracy = accuracy_score(y_test, y_pred)
+# f1 = f1_score(y_test, y_pred)
+
+# print(f"Accuracy: {accuracy:.2f}")
+# print(f"F1 Score: {f1:.2f}")

From 39dc712feb95de524984b242e4af0b022a299321 Mon Sep 17 00:00:00 2001
From: Preethi1609
Date: Fri, 27 Oct 2023 13:12:14 -0400
Subject: [PATCH 18/18] analysis script

---
 analyse.py | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 analyse.py

diff --git a/analyse.py b/analyse.py
new file mode 100644
index 0000000..9d12931
--- /dev/null
+++ b/analyse.py
@@ -0,0 +1,11 @@
+import pandas as pd
+
+dirty_test_df = pd.read_csv("/home/preethi/projects/pandas-ai-integration/data/Airbnb/missing_values/dirty_test.csv")
+print("Dirty test len: ", len(dirty_test_df))
+
+dirty_train_df = pd.read_csv("/home/preethi/projects/pandas-ai-integration/data/Airbnb/missing_values/dirty_train.csv")
+print("Dirty train len: ", len(dirty_train_df))
+
+cleaned_df = pd.read_csv("/home/preethi/projects/pandas-ai-integration/cleaned_df.csv")
+print("Cleaned df len: ", len(cleaned_df))
+# Get the number of columns
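
Follow-up notes on the final state of the series:

Script extraction (functions/semantic_cache.py). The generated code is pulled
out of the LLM response with response.split("```")[1].lstrip("python").
str.lstrip strips a character set rather than a prefix, so an untagged fenced
block that begins with, say, print(...) loses its leading characters, and a
response without any fence raises an IndexError. A minimal, more defensive
sketch (extract_python_script is a hypothetical helper, assuming only that
the model wraps code in triple backticks):

import re

def extract_python_script(response: str) -> str:
    # Capture the first fenced block; "python" is an optional language tag
    # here, not a set of characters to strip.
    match = re.search(r"```(?:\s*python)?\s*\n(.*?)```", response, re.DOTALL)
    # Fall back to the raw response when the model returns bare code.
    return match.group(1) if match else response

In forward(), script = extract_python_script(response) would then replace the
split/lstrip pair.

Loss function (benchmark.py, PATCH 17). The torch benchmark pairs a single
sigmoid output with nn.CrossEntropyLoss, which expects (N, C) logits and
class-index targets, so it does not fit a one-unit binary classifier. Given
that forward() already applies torch.sigmoid, the conventional pairing is
nn.BCELoss on the probabilities (or nn.BCEWithLogitsLoss on raw logits with
the sigmoid removed), e.g.:

criterion = nn.BCELoss()  # sigmoid probabilities vs. float 0/1 targets
loss = criterion(model(X_train_tensor).squeeze(), y_train_tensor)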