ExampleHostedChatBot / ObtainChatData.py
AjithKSenthil's picture
Upload 5 files
d2e169c verified
raw
history blame contribute delete
998 Bytes
# ObtainChatData.py
import pandas as pd
# Load & inspect dataset
input_datapath = "../data/chatbotdata.csv"
output_datapath = "../data/processed_chat_data.csv"
df = pd.read_csv(input_datapath)
print("Initial Data Loaded. Here's a preview:")
print(df.head(2))
# Data Preprocessing
df = df.dropna()
print(f"Data after cleaning: {len(df)} entries")
# Combine all transcripts from the same user into a single transcript
df_combined = df.groupby('user_id')['transcript'].apply(' '.join).reset_index()
# Save the processed data
df_combined.to_csv(output_datapath, index=False)
print(f"Processed data saved to {output_datapath}")
# Note to Users:
# - Ensure that your data file path is correctly specified in 'input_datapath'.
# - The script currently includes a basic data cleaning step (dropping missing values). Depending on your data, you may need to add more preprocessing steps.
# - The processed data is saved in CSV format. Make sure to use this processed file in subsequent analysis scripts.