forked from chrislatimer/astra-simple-vector-search
astra_insert.py
import os
import uuid
from astrapy import DataAPIClient
from doc_chunker import chunk_file
from embedding_create import create_embeddings
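# Note: doc_chunker and embedding_create are local helper modules from this
# repo. The rest of this script assumes chunk_file() returns a list of
# paragraph strings and create_embeddings() returns one vector per paragraph
# (e.g. a NumPy array, hence the .tolist() call further down).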
# Fetching necessary environment variables for AstraDB configuration
ASTRA_DB_APPLICATION_TOKEN = os.environ["ASTRA_DB_APPLICATION_TOKEN"]
ASTRA_DB_API_ENDPOINT = os.environ["ASTRA_DB_API_ENDPOINT"]
ASTRA_DB_KEYSPACE = os.environ.get("ASTRA_DB_KEYSPACE")
COLLECTION_NAME = "town_content"
# Initialize connection to Astra DB
client = DataAPIClient()
db = client.get_database(
    ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
    keyspace=ASTRA_DB_KEYSPACE,
)
# Chunk the sample file into paragraphs
paragraphs = chunk_file("./towns/shadowfen.txt")
# Create embeddings for each paragraph
embeddings_list = create_embeddings(paragraphs)
documents = [] # Initialize an empty list to hold document dictionaries
for embeddings, paragraph in zip(embeddings_list, paragraphs):
    # Create a dictionary for the current document
    document = {
        "_id": uuid.uuid4(),
        "text": paragraph,
        "$vector": embeddings.tolist(),
    }
    # Append the document dictionary to the list
    documents.append(document)
# Get (an astrapy reference to) the db collection
collection = db.get_collection(name=COLLECTION_NAME)
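# Note: this script assumes the "town_content" collection already exists with
# vector search enabled. If it did not, it would need to be created first;
# a minimal sketch (the dimension must match the embedding model's output):
#
#   db.create_collection(
#       COLLECTION_NAME,
#       dimension=len(embeddings_list[0]),
#       metric="cosine",
#   )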
# Insert the documents
res = collection.insert_many(documents=documents)
print(f"Inserted {len(res.inserted_ids)} chunks.")