Retrieval-Augmented Generation (RAG)
Combining vector search with knowledge-graph traversal to retrieve better context.
In [1]:
Copied!
import os
import shutil
import tempfile
import uni_db
import os
import shutil
import tempfile
import uni_db
In [2]:
Copied!
# Keep the demo database in a fixed folder under the system temp directory.
db_path = os.path.join(tempfile.gettempdir(), "rag_db")

# Delete whatever an earlier run left at that path so this run does not
# reuse old data.
if os.path.exists(db_path):
    shutil.rmtree(db_path)

db = uni_db.Database(db_path)
print(f"Opened database at {db_path}")
# Keep the demo database in a fixed folder under the system temp directory.
db_path = os.path.join(tempfile.gettempdir(), "rag_db")

# Delete whatever an earlier run left at that path so this run does not
# reuse old data.
if os.path.exists(db_path):
    shutil.rmtree(db_path)

db = uni_db.Database(db_path)
print(f"Opened database at {db_path}")
Opened database at /tmp/rag_db
1. Schema
Text chunks carry embeddings and are linked to named entities through `MENTIONS` edges.
In [3]:
Copied!
# Vertex labels: text chunks and the entities they refer to.
for label in ("Chunk", "Entity"):
    db.create_label(label)

# Directed edge type from Chunk to Entity.
db.create_edge_type("MENTIONS", ["Chunk"], ["Entity"])

# Chunk properties: the raw text and a 4-dimensional embedding vector.
# The trailing False is passed through unchanged (presumably the
# "nullable" flag -- confirm against the uni_db API).
for prop_name, prop_type in (("text", "string"), ("embedding", "vector:4")):
    db.add_property("Chunk", prop_name, prop_type, False)

# Vector index over the embedding property using the cosine metric.
db.create_vector_index("Chunk", "embedding", "cosine")
# Vertex labels: text chunks and the entities they refer to.
for label in ("Chunk", "Entity"):
    db.create_label(label)

# Directed edge type from Chunk to Entity.
db.create_edge_type("MENTIONS", ["Chunk"], ["Entity"])

# Chunk properties: the raw text and a 4-dimensional embedding vector.
# The trailing False is passed through unchanged (presumably the
# "nullable" flag -- confirm against the uni_db API).
for prop_name, prop_type in (("text", "string"), ("embedding", "vector:4")):
    db.add_property("Chunk", prop_name, prop_type, False)

# Vector index over the embedding property using the cosine metric.
db.create_vector_index("Chunk", "embedding", "cosine")
2. Ingest Data
Insert two chunks with toy 4-dimensional embeddings and link both to a single `verify` entity.
In [4]:
Copied!
# Hand-written 4-dimensional embeddings for the two demo chunks.
c1_vec = [1.0, 0.0, 0.0, 0.0]
c2_vec = [0.9, 0.1, 0.0, 0.0]

# Insert both chunks in one call; the two returned ids are unpacked into
# c1 and c2 so later cells can refer to each chunk individually.
chunk_rows = [
    {"text": "Function verify() checks signatures.", "embedding": c1_vec},
    {"text": "Other text about verify.", "embedding": c2_vec},
]
c_vids = db.bulk_insert_vertices("Chunk", chunk_rows)
c1, c2 = c_vids

# A single entity that both chunks mention.
e_vids = db.bulk_insert_vertices("Entity", [dict(name="verify", type="function")])
e1 = e_vids[0]

# One MENTIONS edge per chunk, each with an empty property dict.
db.bulk_insert_edges("MENTIONS", [(chunk_vid, e1, {}) for chunk_vid in (c1, c2)])

# flush() runs before the data is queried (presumably to make the bulk
# writes visible -- confirm against the uni_db API).
db.flush()
# Hand-written 4-dimensional embeddings for the two demo chunks.
c1_vec = [1.0, 0.0, 0.0, 0.0]
c2_vec = [0.9, 0.1, 0.0, 0.0]

# Insert both chunks in one call; the two returned ids are unpacked into
# c1 and c2 so later cells can refer to each chunk individually.
chunk_rows = [
    {"text": "Function verify() checks signatures.", "embedding": c1_vec},
    {"text": "Other text about verify.", "embedding": c2_vec},
]
c_vids = db.bulk_insert_vertices("Chunk", chunk_rows)
c1, c2 = c_vids

# A single entity that both chunks mention.
e_vids = db.bulk_insert_vertices("Entity", [dict(name="verify", type="function")])
e1 = e_vids[0]

# One MENTIONS edge per chunk, each with an empty property dict.
db.bulk_insert_edges("MENTIONS", [(chunk_vid, e1, {}) for chunk_vid in (c1, c2)])

# flush() runs before the data is queried (presumably to make the bulk
# writes visible -- confirm against the uni_db API).
db.flush()
3. Hybrid Retrieval
Starting from a specific chunk, find the other chunks that mention the same entity.
In [5]:
Copied!
# Graph traversal: start at one chunk, follow MENTIONS to each entity it
# references, then follow MENTIONS back to every *other* chunk that
# references the same entity. The adjacent string pieces concatenate into
# a single query string.
query = (
    "MATCH (c:Chunk)-[:MENTIONS]->(e:Entity)<-[:MENTIONS]-(related:Chunk) "
    "WHERE c._vid = $cid AND related._vid <> c._vid "
    "RETURN related.text as text"
)

# $cid is bound to the id of the first chunk inserted above.
# NOTE(review): the recorded output includes a DataFusion DEBUG message
# about a UInt64/Int64 mismatch followed by a fallback; presumably it is
# triggered by the _vid comparison -- confirm.
results = db.query(query, {"cid": c1})
for r in results:
    print(f"Related text: {r['text']}")
# Graph traversal: start at one chunk, follow MENTIONS to each entity it
# references, then follow MENTIONS back to every *other* chunk that
# references the same entity. The adjacent string pieces concatenate into
# a single query string.
query = (
    "MATCH (c:Chunk)-[:MENTIONS]->(e:Entity)<-[:MENTIONS]-(related:Chunk) "
    "WHERE c._vid = $cid AND related._vid <> c._vid "
    "RETURN related.text as text"
)

# $cid is bound to the id of the first chunk inserted above.
# NOTE(review): the recorded output includes a DataFusion DEBUG message
# about a UInt64/Int64 mismatch followed by a fallback; presumably it is
# triggered by the _vid comparison -- confirm.
results = db.query(query, {"cid": c1})
for r in results:
    print(f"Related text: {r['text']}")
Related text: Other text about verify.
DEBUG 2: DataFusion execution failed (falling back to execute_subplan): Internal error: Only intervals with the same data type are intersectable, lhs:UInt64, rhs:Int64. This issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues