In [ ]:
from openai import OpenAI # Works only with openai version >= 1.2.0
from dotenv import load_dotenv,find_dotenv
import pandas as pd
load_dotenv(find_dotenv())
client = OpenAI()
from langchain.agents.agent_types import AgentType
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain_experimental.agents.agent_toolkits import create_csv_agent
In [ ]:
agent = create_csv_agent(
OpenAI(temperature=0),
["train.csv", "test.csv"], #or a single file without big brackets
verbose=True,
agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
)
agent
Out[ ]:
AgentExecutor(verbose=True, agent=ZeroShotAgent(llm_chain=LLMChain(prompt=PromptTemplate(input_variables=['agent_scratchpad', 'input'], partial_variables={'num_dfs': '2', 'dfs_head': '| | id | vendor_id | pickup_datetime | dropoff_datetime | passenger_count | pickup_longitude | pickup_latitude | dropoff_longitude | dropoff_latitude | store_and_fwd_flag | trip_duration |\n|---:|:----------|------------:|:--------------------|:--------------------|------------------:|-------------------:|------------------:|--------------------:|-------------------:|:---------------------|----------------:|\n| 0 | id2875421 | 2 | 2016-03-14 17:24:55 | 2016-03-14 17:32:30 | 1 | -73.9822 | 40.7679 | -73.9646 | 40.7656 | N | 455 |\n| 1 | id2377394 | 1 | 2016-06-12 00:43:35 | 2016-06-12 00:54:38 | 1 | -73.9804 | 40.7386 | -73.9995 | 40.7312 | N | 663 |\n| 2 | id3858529 | 2 | 2016-01-19 11:35:24 | 2016-01-19 12:10:48 | 1 | -73.979 | 40.7639 | -74.0053 | 40.7101 | N | 2124 |\n| 3 | id3504673 | 2 | 2016-04-06 19:32:31 | 2016-04-06 19:39:40 | 1 | -74.01 | 40.72 | -74.0123 | 40.7067 | N | 429 |\n| 4 | id2181028 | 2 | 2016-03-26 13:30:55 | 2016-03-26 13:38:10 | 1 | -73.9731 | 40.7932 | -73.9729 | 40.7825 | N | 435 |\n\n| | id | vendor_id | pickup_datetime | passenger_count | pickup_longitude | pickup_latitude | dropoff_longitude | dropoff_latitude | store_and_fwd_flag |\n|---:|:----------|------------:|:--------------------|------------------:|-------------------:|------------------:|--------------------:|-------------------:|:---------------------|\n| 0 | id3004672 | 1 | 2016-06-30 23:59:58 | 1 | -73.9881 | 40.732 | -73.9902 | 40.7567 | N |\n| 1 | id3505355 | 1 | 2016-06-30 23:59:53 | 1 | -73.9642 | 40.68 | -73.9598 | 40.6554 | N |\n| 2 | id1217141 | 1 | 2016-06-30 23:59:47 | 1 | -73.9974 | 40.7376 | -73.9862 | 40.7295 | N |\n| 3 | id2150126 | 2 | 2016-06-30 23:59:41 | 1 | -73.9561 | 40.7719 | -73.9864 | 40.7305 | N |\n| 4 | id1598245 | 1 | 2016-06-30 23:59:33 | 1 | -73.9702 | 40.7615 | -73.9615 | 40.7559 | N |'}, template='\nYou are working with {num_dfs} pandas dataframes in Python named df1, df2, etc. You \nshould use the tools below to answer the question posed of you:\n\npython_repl_ast: A Python shell. Use this to execute python commands. Input should be a valid python command. When using this tool, sometimes output is abbreviated - make sure it does not look abbreviated before using it in your answer.\n\nUse the following format:\n\nQuestion: the input question you must answer\nThought: you should always think about what to do\nAction: the action to take, should be one of [python_repl_ast]\nAction Input: the input to the action\nObservation: the result of the action\n... (this Thought/Action/Action Input/Observation can repeat N times)\nThought: I now know the final answer\nFinal Answer: the final answer to the original input question\n\n\nThis is the result of `print(df.head())` for each dataframe:\n{dfs_head}\n\nBegin!\nQuestion: {input}\n{agent_scratchpad}'), llm=OpenAI(client=<openai.resources.completions.Completions object at 0x0000020D1E785C30>, async_client=<openai.resources.completions.AsyncCompletions object at 0x0000020D42864400>, temperature=0.0, openai_api_key='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', openai_proxy='')), output_parser=MRKLOutputParser(), allowed_tools=['python_repl_ast']), tools=[PythonAstREPLTool(locals={'df1': id vendor_id pickup_datetime dropoff_datetime \ 0 id2875421 2 2016-03-14 17:24:55 2016-03-14 17:32:30 1 id2377394 1 2016-06-12 00:43:35 2016-06-12 00:54:38 2 id3858529 2 2016-01-19 11:35:24 2016-01-19 12:10:48 3 id3504673 2 2016-04-06 19:32:31 2016-04-06 19:39:40 4 id2181028 2 2016-03-26 13:30:55 2016-03-26 13:38:10 ... ... ... ... ... 1458639 id2376096 2 2016-04-08 13:31:04 2016-04-08 13:44:02 1458640 id1049543 1 2016-01-10 07:35:15 2016-01-10 07:46:10 1458641 id2304944 2 2016-04-22 06:57:41 2016-04-22 07:10:25 1458642 id2714485 1 2016-01-05 15:56:26 2016-01-05 16:02:39 1458643 id1209952 1 2016-04-05 14:44:25 2016-04-05 14:47:43 passenger_count pickup_longitude pickup_latitude \ 0 1 -73.982155 40.767937 1 1 -73.980415 40.738564 2 1 -73.979027 40.763939 3 1 -74.010040 40.719971 4 1 -73.973053 40.793209 ... ... ... ... 1458639 4 -73.982201 40.745522 1458640 1 -74.000946 40.747379 1458641 1 -73.959129 40.768799 1458642 1 -73.982079 40.749062 1458643 1 -73.979538 40.781750 dropoff_longitude dropoff_latitude store_and_fwd_flag trip_duration 0 -73.964630 40.765602 N 455 1 -73.999481 40.731152 N 663 2 -74.005333 40.710087 N 2124 3 -74.012268 40.706718 N 429 4 -73.972923 40.782520 N 435 ... ... ... ... ... 1458639 -73.994911 40.740170 N 778 1458640 -73.970184 40.796547 N 655 1458641 -74.004433 40.707371 N 764 1458642 -73.974632 40.757107 N 373 1458643 -73.972809 40.790585 N 198 [1458644 rows x 11 columns], 'df2': id vendor_id pickup_datetime passenger_count \ 0 id3004672 1 2016-06-30 23:59:58 1 1 id3505355 1 2016-06-30 23:59:53 1 2 id1217141 1 2016-06-30 23:59:47 1 3 id2150126 2 2016-06-30 23:59:41 1 4 id1598245 1 2016-06-30 23:59:33 1 ... ... ... ... ... 625129 id3008929 1 2016-01-01 00:02:52 1 625130 id3700764 1 2016-01-01 00:01:52 1 625131 id2568735 1 2016-01-01 00:01:24 2 625132 id1384355 1 2016-01-01 00:00:28 1 625133 id0621643 2 2016-01-01 00:00:22 2 pickup_longitude pickup_latitude dropoff_longitude \ 0 -73.988129 40.732029 -73.990173 1 -73.964203 40.679993 -73.959808 2 -73.997437 40.737583 -73.986160 3 -73.956070 40.771900 -73.986427 4 -73.970215 40.761475 -73.961510 ... ... ... ... 625129 -74.003464 40.725105 -74.001251 625130 -74.006363 40.743782 -73.953407 625131 -73.972267 40.759865 -73.876602 625132 -73.976501 40.733562 -73.854263 625133 -73.981850 40.716881 -73.969330 dropoff_latitude store_and_fwd_flag 0 40.756680 N 1 40.655403 N 2 40.729523 N 3 40.730469 N 4 40.755890 N ... ... ... 625129 40.733643 N 625130 40.782467 N 625131 40.748665 N 625132 40.891788 N 625133 40.769379 N [625134 rows x 9 columns]})])
In [ ]:
agent.run("how many rows in the age column are different between the two dfs?")
> Entering new AgentExecutor chain... Thought: I need to compare the two dataframes Action: python_repl_ast Action Input: df1['id'].isin(df2['id']).value_counts() Observation: False 1458644 Name: id, dtype: int64 Thought: I now know the final answer Final Answer: 1458644 > Finished chain.
Out[ ]:
'1458644'