Retrieval-Augmented Generation (RAG)¶

Combining Vector Search with Knowledge Graph traversal for better context.

In [1]:

Copied!

import os
import shutil
import tempfile

import uni_db
import os
import shutil
import tempfile

import uni_db

In [2]:

Copied!





db_path = os.path.join(tempfile.gettempdir(), "rag_db")
if os.path.exists(db_path):
    shutil.rmtree(db_path)
db = uni_db.Database(db_path)
print(f"Opened database at {db_path}")
db_path = os.path.join(tempfile.gettempdir(), "rag_db")
if os.path.exists(db_path):
    shutil.rmtree(db_path)
db = uni_db.Database(db_path)
print(f"Opened database at {db_path}")

Opened database at /tmp/rag_db

1. Schema¶

Chunks of text with embeddings, linked to named Entities.

In [3]:

Copied!





db.create_label("Chunk")
db.create_label("Entity")
db.create_edge_type("MENTIONS", ["Chunk"], ["Entity"])

db.add_property("Chunk", "text", "string", False)
db.add_property("Chunk", "embedding", "vector:4", False)
db.create_vector_index("Chunk", "embedding", "cosine")
db.create_label("Chunk")
db.create_label("Entity")
db.create_edge_type("MENTIONS", ["Chunk"], ["Entity"])

db.add_property("Chunk", "text", "string", False)
db.add_property("Chunk", "embedding", "vector:4", False)
db.create_vector_index("Chunk", "embedding", "cosine")

2. Ingest Data¶

In [4]:

Copied!





c1_vec = [1.0, 0.0, 0.0, 0.0]
c2_vec = [0.9, 0.1, 0.0, 0.0]

c_vids = db.bulk_insert_vertices('Chunk', [
    {'text': 'Function verify() checks signatures.', 'embedding': c1_vec},
    {'text': 'Other text about verify.', 'embedding': c2_vec}
]) 
c1, c2 = c_vids

e_vids = db.bulk_insert_vertices('Entity', [{'name': 'verify', 'type': 'function'}])
e1 = e_vids[0]

db.bulk_insert_edges('MENTIONS', [(c1, e1, {}), (c2, e1, {})])
db.flush()
c1_vec = [1.0, 0.0, 0.0, 0.0]
c2_vec = [0.9, 0.1, 0.0, 0.0]

c_vids = db.bulk_insert_vertices('Chunk', [
    {'text': 'Function verify() checks signatures.', 'embedding': c1_vec},
    {'text': 'Other text about verify.', 'embedding': c2_vec}
]) 
c1, c2 = c_vids

e_vids = db.bulk_insert_vertices('Entity', [{'name': 'verify', 'type': 'function'}])
e1 = e_vids[0]

db.bulk_insert_edges('MENTIONS', [(c1, e1, {}), (c2, e1, {})])
db.flush()

3. Hybrid Retrieval¶

Find chunks related to a specific chunk via shared entities.

In [5]:

Copied!





query = "MATCH (c:Chunk)-[:MENTIONS]->(e:Entity)<-[:MENTIONS]-(related:Chunk) WHERE c._vid = $cid AND related._vid <> c._vid RETURN related.text as text"
results = db.query(query, {"cid": c1})
for r in results:
    print(f"Related text: {r['text']}")
query = "MATCH (c:Chunk)-[:MENTIONS]->(e:Entity)<-[:MENTIONS]-(related:Chunk) WHERE c._vid = $cid AND related._vid <> c._vid RETURN related.text as text"
results = db.query(query, {"cid": c1})
for r in results:
    print(f"Related text: {r['text']}")

Related text: Other text about verify.

DEBUG 2: DataFusion execution failed (falling back to execute_subplan): Internal error: Only intervals with the same data type are intersectable, lhs:UInt64, rhs:Int64.
This issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues