# File size: 1,298 Bytes
# commit: d2e169c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import pandas as pd
import openai
import tiktoken
import os
import config
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv(override=True)  # values from .env take precedence over inherited env vars

# Fail fast with a clear message if the API key is missing; otherwise the
# script would only fail later, mid-run, with an opaque authentication error.
_api_key = os.getenv("OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your environment or .env file"
    )

client = OpenAI(api_key=_api_key)

# Embedding model parameters.
embedding_model = "text-embedding-ada-002"  # model passed explicitly at the call site below
embedding_encoding = "cl100k_base"          # tokenizer matching ada-002
max_tokens = 8000                           # keep inputs safely under the model's token limit

# Function to get embeddings
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector (list of floats) for *text*.

    Newlines are replaced with spaces before the API call, as OpenAI
    recommends for embedding inputs.

    NOTE(review): the default model differs from the module-level
    ``embedding_model`` (ada-002) and its matching ``cl100k_base``
    encoding; the call site below always passes the model explicitly,
    so the default is kept unchanged for backward compatibility —
    confirm which model is actually intended.
    """
    text = text.replace("\n", " ")
    response = client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding

# Load preprocessed chat transcript data.
input_datapath = "../data/processed_chat_data.csv"
output_datapath = "../data/chat_transcripts_with_embeddings.csv"
df = pd.read_csv(input_datapath)

# Drop rows with a missing transcript: pandas reads empty cells as float NaN,
# which would raise a TypeError inside encoding.encode().
df = df.dropna(subset=["transcript"])

# Ensure your chat transcripts are within the token limit for embedding.
encoding = tiktoken.get_encoding(embedding_encoding)
df["n_tokens"] = df["transcript"].apply(lambda t: len(encoding.encode(t)))
# .copy() so the filtered frame is independent of the original; without it,
# the df["embedding"] assignment below triggers SettingWithCopyWarning and
# may not write through reliably.
df = df[df["n_tokens"] <= max_tokens].copy()

# Extract embeddings for each chat transcript (one API call per row).
print("Extracting embeddings...")
df["embedding"] = df["transcript"].apply(lambda t: get_embedding(t, embedding_model))

# Save the data with embeddings.
df.to_csv(output_datapath, index=False)
print(f"Data with embeddings saved to {output_datapath}")