06 chat with csv using create_csv_agent from langchain

In [ ]:

from openai import OpenAI  # Works only with openai version >= 1.2.0
from dotenv import load_dotenv,find_dotenv
import pandas as pd
load_dotenv(find_dotenv())
client = OpenAI()
from langchain.agents.agent_types import AgentType
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain_experimental.agents.agent_toolkits import create_csv_agent

In [ ]:

agent = create_csv_agent(
    OpenAI(temperature=0),
    ["train.csv", "test.csv"], #or a single file without big brackets
    verbose=True,
    agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
)
agent

Out[ ]:

AgentExecutor(verbose=True, agent=ZeroShotAgent(llm_chain=LLMChain(prompt=PromptTemplate(input_variables=['agent_scratchpad', 'input'], partial_variables={'num_dfs': '2', 'dfs_head': '|    | id        |   vendor_id | pickup_datetime     | dropoff_datetime    |   passenger_count |   pickup_longitude |   pickup_latitude |   dropoff_longitude |   dropoff_latitude | store_and_fwd_flag   |   trip_duration |\n|---:|:----------|------------:|:--------------------|:--------------------|------------------:|-------------------:|------------------:|--------------------:|-------------------:|:---------------------|----------------:|\n|  0 | id2875421 |           2 | 2016-03-14 17:24:55 | 2016-03-14 17:32:30 |                 1 |           -73.9822 |           40.7679 |            -73.9646 |            40.7656 | N                    |             455 |\n|  1 | id2377394 |           1 | 2016-06-12 00:43:35 | 2016-06-12 00:54:38 |                 1 |           -73.9804 |           40.7386 |            -73.9995 |            40.7312 | N                    |             663 |\n|  2 | id3858529 |           2 | 2016-01-19 11:35:24 | 2016-01-19 12:10:48 |                 1 |           -73.979  |           40.7639 |            -74.0053 |            40.7101 | N                    |            2124 |\n|  3 | id3504673 |           2 | 2016-04-06 19:32:31 | 2016-04-06 19:39:40 |                 1 |           -74.01   |           40.72   |            -74.0123 |            40.7067 | N                    |             429 |\n|  4 | id2181028 |           2 | 2016-03-26 13:30:55 | 2016-03-26 13:38:10 |                 1 |           -73.9731 |           40.7932 |            -73.9729 |            40.7825 | N                    |             435 |\n\n|    | id        |   vendor_id | pickup_datetime     |   passenger_count |   pickup_longitude |   pickup_latitude |   dropoff_longitude |   dropoff_latitude | store_and_fwd_flag   |\n|---:|:----------|------------:|:--------------------|------------------:|-------------------:|------------------:|--------------------:|-------------------:|:---------------------|\n|  0 | id3004672 |           1 | 2016-06-30 23:59:58 |                 1 |           -73.9881 |           40.732  |            -73.9902 |            40.7567 | N                    |\n|  1 | id3505355 |           1 | 2016-06-30 23:59:53 |                 1 |           -73.9642 |           40.68   |            -73.9598 |            40.6554 | N                    |\n|  2 | id1217141 |           1 | 2016-06-30 23:59:47 |                 1 |           -73.9974 |           40.7376 |            -73.9862 |            40.7295 | N                    |\n|  3 | id2150126 |           2 | 2016-06-30 23:59:41 |                 1 |           -73.9561 |           40.7719 |            -73.9864 |            40.7305 | N                    |\n|  4 | id1598245 |           1 | 2016-06-30 23:59:33 |                 1 |           -73.9702 |           40.7615 |            -73.9615 |            40.7559 | N                    |'}, template='\nYou are working with {num_dfs} pandas dataframes in Python named df1, df2, etc. You \nshould use the tools below to answer the question posed of you:\n\npython_repl_ast: A Python shell. Use this to execute python commands. Input should be a valid python command. When using this tool, sometimes output is abbreviated - make sure it does not look abbreviated before using it in your answer.\n\nUse the following format:\n\nQuestion: the input question you must answer\nThought: you should always think about what to do\nAction: the action to take, should be one of [python_repl_ast]\nAction Input: the input to the action\nObservation: the result of the action\n... (this Thought/Action/Action Input/Observation can repeat N times)\nThought: I now know the final answer\nFinal Answer: the final answer to the original input question\n\n\nThis is the result of `print(df.head())` for each dataframe:\n{dfs_head}\n\nBegin!\nQuestion: {input}\n{agent_scratchpad}'), llm=OpenAI(client=<openai.resources.completions.Completions object at 0x0000020D1E785C30>, async_client=<openai.resources.completions.AsyncCompletions object at 0x0000020D42864400>, temperature=0.0, openai_api_key='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', openai_proxy='')), output_parser=MRKLOutputParser(), allowed_tools=['python_repl_ast']), tools=[PythonAstREPLTool(locals={'df1':                 id  vendor_id      pickup_datetime     dropoff_datetime  \
0        id2875421          2  2016-03-14 17:24:55  2016-03-14 17:32:30   
1        id2377394          1  2016-06-12 00:43:35  2016-06-12 00:54:38   
2        id3858529          2  2016-01-19 11:35:24  2016-01-19 12:10:48   
3        id3504673          2  2016-04-06 19:32:31  2016-04-06 19:39:40   
4        id2181028          2  2016-03-26 13:30:55  2016-03-26 13:38:10   
...            ...        ...                  ...                  ...   
1458639  id2376096          2  2016-04-08 13:31:04  2016-04-08 13:44:02   
1458640  id1049543          1  2016-01-10 07:35:15  2016-01-10 07:46:10   
1458641  id2304944          2  2016-04-22 06:57:41  2016-04-22 07:10:25   
1458642  id2714485          1  2016-01-05 15:56:26  2016-01-05 16:02:39   
1458643  id1209952          1  2016-04-05 14:44:25  2016-04-05 14:47:43   

         passenger_count  pickup_longitude  pickup_latitude  \
0                      1        -73.982155        40.767937   
1                      1        -73.980415        40.738564   
2                      1        -73.979027        40.763939   
3                      1        -74.010040        40.719971   
4                      1        -73.973053        40.793209   
...                  ...               ...              ...   
1458639                4        -73.982201        40.745522   
1458640                1        -74.000946        40.747379   
1458641                1        -73.959129        40.768799   
1458642                1        -73.982079        40.749062   
1458643                1        -73.979538        40.781750   

         dropoff_longitude  dropoff_latitude store_and_fwd_flag  trip_duration  
0               -73.964630         40.765602                  N            455  
1               -73.999481         40.731152                  N            663  
2               -74.005333         40.710087                  N           2124  
3               -74.012268         40.706718                  N            429  
4               -73.972923         40.782520                  N            435  
...                    ...               ...                ...            ...  
1458639         -73.994911         40.740170                  N            778  
1458640         -73.970184         40.796547                  N            655  
1458641         -74.004433         40.707371                  N            764  
1458642         -73.974632         40.757107                  N            373  
1458643         -73.972809         40.790585                  N            198  

[1458644 rows x 11 columns], 'df2':                id  vendor_id      pickup_datetime  passenger_count  \
0       id3004672          1  2016-06-30 23:59:58                1   
1       id3505355          1  2016-06-30 23:59:53                1   
2       id1217141          1  2016-06-30 23:59:47                1   
3       id2150126          2  2016-06-30 23:59:41                1   
4       id1598245          1  2016-06-30 23:59:33                1   
...           ...        ...                  ...              ...   
625129  id3008929          1  2016-01-01 00:02:52                1   
625130  id3700764          1  2016-01-01 00:01:52                1   
625131  id2568735          1  2016-01-01 00:01:24                2   
625132  id1384355          1  2016-01-01 00:00:28                1   
625133  id0621643          2  2016-01-01 00:00:22                2   

        pickup_longitude  pickup_latitude  dropoff_longitude  \
0             -73.988129        40.732029         -73.990173   
1             -73.964203        40.679993         -73.959808   
2             -73.997437        40.737583         -73.986160   
3             -73.956070        40.771900         -73.986427   
4             -73.970215        40.761475         -73.961510   
...                  ...              ...                ...   
625129        -74.003464        40.725105         -74.001251   
625130        -74.006363        40.743782         -73.953407   
625131        -73.972267        40.759865         -73.876602   
625132        -73.976501        40.733562         -73.854263   
625133        -73.981850        40.716881         -73.969330   

        dropoff_latitude store_and_fwd_flag  
0              40.756680                  N  
1              40.655403                  N  
2              40.729523                  N  
3              40.730469                  N  
4              40.755890                  N  
...                  ...                ...  
625129         40.733643                  N  
625130         40.782467                  N  
625131         40.748665                  N  
625132         40.891788                  N  
625133         40.769379                  N  

[625134 rows x 9 columns]})])

In [ ]:

agent.run("how many rows in the age column are different between the two dfs?")


> Entering new AgentExecutor chain...
Thought: I need to compare the two dataframes
Action: python_repl_ast
Action Input: df1['id'].isin(df2['id']).value_counts()
Observation: False    1458644
Name: id, dtype: int64
Thought: I now know the final answer
Final Answer: 1458644

> Finished chain.

Out[ ]:

'1458644'

The Github Experiment -By Ajay Gupta

Search

Wednesday, 22 November 2023

Chat with CSV using create_csv_agent from langchain