The RNA Lab Navigator is a private, retrieval-augmented assistant designed for the RNA-biology lab. This document provides a detailed technical overview of the system's architecture, focusing on the RAG (Retrieval Augmented Generation) pipeline implementation.
Document ingestion begins by splitting each document into overlapping word-level chunks:

def chunk_document(text, chunk_size=400, overlap=100):
    """
    Split document text into overlapping chunks.

    Args:
        text: Document text to chunk
        chunk_size: Target word count per chunk
        overlap: Word overlap between chunks

    Returns:
        List of text chunks
    """
    words = text.split()
    chunks = []
    # Step through the text with a stride of (chunk_size - overlap) so
    # consecutive chunks share `overlap` words of context
    for i in range(0, len(words), chunk_size - overlap):
        chunk = words[i:i + chunk_size]
        if len(chunk) < 50:  # Skip tiny chunks at the end
            continue
        chunks.append(" ".join(chunk))
    return chunks
def generate_embeddings(chunks, use_cache=True):
    """
    Generate embeddings for text chunks with caching.

    Args:
        chunks: List of text chunks
        use_cache: Whether to use embedding cache

    Returns:
        List of embedding vectors
    """
    embeddings = []
    batch_size = 20
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        batch_embeddings = []
        for chunk in batch:
            # Generate cache key from content
            chunk_hash = hashlib.sha256(chunk.encode()).hexdigest()
            if use_cache and redis_client.exists(f"emb:{chunk_hash}"):
                # Retrieve from cache
                embedding = json.loads(redis_client.get(f"emb:{chunk_hash}"))
            else:
                # Generate new embedding
                response = openai.Embedding.create(
                    input=chunk,
                    model="text-embedding-ada-002"
                )
                embedding = response['data'][0]['embedding']
                # Cache the embedding with 30-day TTL
                if use_cache:
                    redis_client.setex(
                        f"emb:{chunk_hash}",
                        60 * 60 * 24 * 30,  # 30 day TTL
                        json.dumps(embedding)
                    )
            batch_embeddings.append(embedding)
        embeddings.extend(batch_embeddings)
    return embeddings
class_obj = {
    "class": "Document",
    "vectorizer": "none",  # We provide vectors directly
    "vectorIndexConfig": {
        "distance": "cosine",
        "ef": 256,
        "efConstruction": 256,
        "maxConnections": 64,
    },
    "properties": [
        {"name": "content", "dataType": ["text"]},
        {"name": "doc_type", "dataType": ["text"], "indexFilterable": True},
        {"name": "source_file", "dataType": ["text"], "indexFilterable": True},
        {"name": "chunk_index", "dataType": ["int"]},
        {"name": "author", "dataType": ["text"], "indexFilterable": True},
        {"name": "year", "dataType": ["int"], "indexFilterable": True},
        {"name": "chapter", "dataType": ["text"]},
        {"name": "department", "dataType": ["text"]},
        {"name": "category", "dataType": ["text"], "indexFilterable": True},
        {"name": "last_updated", "dataType": ["date"]},
        {"name": "reagents", "dataType": ["text[]"]},
        {"name": "authors", "dataType": ["text[]"]},
        {"name": "publication_date", "dataType": ["date"]},
        {"name": "journal", "dataType": ["text"], "indexFilterable": True},
        {"name": "doi", "dataType": ["text"]},
    ],
    "moduleConfig": {
        "text2vec-contextionary": {
            "skip": True  # Skip built-in vectorization
        },
        "text2vec-transformers": {
            "skip": True  # Skip built-in vectorization
        }
    }
}
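The schema above defines the Weaviate Document class with HNSW cosine indexing and all built-in vectorizers disabled, so every object must be inserted with its own vector. The following is a minimal ingestion sketch under that assumption; it reuses chunk_document and generate_embeddings from above, and the weaviate-client v3 batch API usage is an assumption about how the project wires this together.

# Illustrative ingestion path tying the pieces above together;
# assumes a connected weaviate-client v3 instance named weaviate_client
def ingest_document(text, source_file, doc_type):
    chunks = chunk_document(text)
    vectors = generate_embeddings(chunks)
    with weaviate_client.batch as batch:
        for idx, (chunk, vector) in enumerate(zip(chunks, vectors)):
            batch.add_data_object(
                data_object={
                    "content": chunk,
                    "doc_type": doc_type,
                    "source_file": source_file,
                    "chunk_index": idx,
                },
                class_name="Document",
                vector=vector,
            )

# One-time schema registration (run once per deployment)
weaviate_client.schema.create_class(class_obj)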
def preprocess_query(query):
    """
    Clean and normalize the query text
    """
    # Remove special characters
    query = re.sub(r'[^\w\s]', ' ', query)
    # Normalize whitespace
    query = re.sub(r'\s+', ' ', query).strip()
    return query
def expand_query(query):
    """
    Expand query with related terms
    """
    # Extract key entities
    entities = extract_entities(query)
    # Add related terms based on domain knowledge
    expanded_terms = []
    for entity in entities:
        if entity in domain_knowledge:
            expanded_terms.extend(domain_knowledge[entity]['synonyms'])
    # Combine original query with expanded terms
    expanded_query = query
    if expanded_terms:
        expanded_query = f"{query} {' '.join(expanded_terms)}"
    return expanded_query
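extract_entities and domain_knowledge are not defined in this excerpt; the sketch below shows one plausible shape for them, purely as an assumption to make the expansion step concrete.

# Hypothetical domain-knowledge table and entity extractor; the real
# implementations are not shown in this document
domain_knowledge = {
    "sgRNA": {"synonyms": ["single guide RNA", "guide RNA", "gRNA"]},
    "IVT": {"synonyms": ["in vitro transcription"]},
}

def extract_entities(query):
    # Naive lookup: any token that matches a known key counts as an entity
    return [token for token in query.split() if token in domain_knowledge]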
def hybrid_search(query, filters=None, limit=10):
    """
    Perform hybrid vector + keyword search
    """
    # Generate embedding for query
    query_embedding = generate_embedding(query)
    # Build the hybrid (BM25 + vector) query; the query vector is supplied
    # explicitly because the class uses no built-in vectorizer
    search_query = weaviate_client.query.get(
        "Document", ["content", "doc_type", "source_file", "chunk_index",
                     "author", "year", "chapter", "category"]
    ).with_additional(["distance", "score"]).with_hybrid(
        query=query,
        alpha=0.75,  # Weight for vector search vs keyword search
        vector=query_embedding
    ).with_limit(limit)
    # Apply metadata filters if specified
    if filters:
        search_query = search_query.with_where(filters)
    # Execute search
    results = search_query.do()
    return results["data"]["Get"]["Document"]
def rerank_results(query, results, top_k=3):
    """
    Rerank search results using cross-encoder
    """
    # If we have no results, return empty list
    if not results:
        return []
    # Prepare pairs for cross-encoder
    pairs = [(query, result["content"]) for result in results]
    # Get relevance scores
    relevance_scores = cross_encoder.predict(pairs)
    # Add scores to results
    for i, result in enumerate(results):
        result["relevance_score"] = float(relevance_scores[i])
    # Sort by relevance score
    reranked_results = sorted(results, key=lambda x: x["relevance_score"], reverse=True)
    # Apply confidence threshold
    filtered_results = [r for r in reranked_results if r["relevance_score"] >= 0.45]
    # Return top k results
    return filtered_results[:top_k]
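The cross_encoder object above is assumed to be loaded once at startup and held in memory (a MiniLM cross-encoder is mentioned in the performance notes below). A minimal sketch of that setup follows; the exact model name is an assumption.

# Assumed setup, not shown in this document: load a MiniLM cross-encoder
# once at module import so reranking reuses the in-memory model
from sentence_transformers import CrossEncoder

cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")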
def prepare_context(results):
    """
    Format search results into LLM-ready context with citations
    """
    context_parts = []
    source_map = {}
    for i, result in enumerate(results):
        # Create citation token
        citation = f"[{i+1}]"
        source_id = f"source_{i+1}"
        # Add to source map for later reference
        source_map[source_id] = {
            "doc_type": result.get("doc_type", ""),
            "title": result.get("source_file", "").split("/")[-1],
            "author": result.get("author", ""),
            "year": result.get("year", ""),
            "chapter": result.get("chapter", ""),
            "score": result.get("relevance_score", 0)
        }
        # Format the context with citation
        context_part = f"{result['content']} {citation}"
        context_parts.append(context_part)
    # Combine all contexts
    context = "\n\n".join(context_parts)
    return context, source_map
def construct_prompt(query, context, source_map):
    """
    Construct the LLM prompt with system message, query, and context
    """
    system_message = """
    You are RNA Lab Navigator, a specialized assistant for an RNA biology research lab.
    Answer only from the provided sources; if unsure, say 'I don't know.'

    Important rules:
    1. Include citations for all factual statements using the [X] format
    2. Citations must appear at the end of the sentence containing the information
    3. Only reference information explicitly stated in the provided context
    4. Maintain scientific accuracy and precision
    5. If multiple sources confirm the same information, cite all of them
    6. If the query cannot be answered from the provided context, say 'I don't know'
    7. Never make up information or citations
    """
    # Construct the full prompt
    prompt = f"""
    Context:
    {context}

    Sources:
    {json.dumps(source_map, indent=2)}

    Question:
    {query}
    """
    return {
        "system": system_message,
        "prompt": prompt
    }
def generate_answer(prompt_data, stream=True):
    """
    Generate answer from LLM using the constructed prompt
    """
    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": prompt_data["system"]},
            {"role": "user", "content": prompt_data["prompt"]}
        ],
        temperature=0.2,
        max_tokens=1000,
        stream=stream
    )
    if stream:
        # Return a generator so tokens can be streamed to the client as
        # they arrive; keeping the yield in a nested function means the
        # non-streaming branch below still returns a plain string. The
        # caller can join the yielded pieces to obtain the full answer
        # for citation validation.
        def stream_chunks():
            for chunk in response:
                yield chunk.choices[0].delta.get("content", "")
        return stream_chunks()
    # Non-streaming: return the full response directly
    return response.choices[0].message.content
def validate_citations(answer, source_map):
    """
    Validate all citations in the answer
    """
    citation_pattern = r'\[(\d+)\]'
    citations = re.findall(citation_pattern, answer)
    valid_citations = []
    for citation in citations:
        source_id = f"source_{citation}"
        if source_id in source_map:
            valid_citations.append(source_id)
    # If no valid citations found, mark low confidence
    if not valid_citations:
        return False, 0.0
    # Calculate overall confidence based on source relevance scores
    confidence = sum(source_map[source_id]["score"] for source_id in valid_citations) / len(valid_citations)
    return True, confidence
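Taken together, the stages above form the full query path. The sketch below is an illustrative wiring of those functions, not the project's actual entry point; the answer_query name and the non-streaming call are assumptions.

# Illustrative orchestration of the pipeline stages defined above;
# the function name and control flow here are assumptions
def answer_query(raw_query):
    query = preprocess_query(raw_query)
    expanded = expand_query(query)
    results = hybrid_search(expanded, limit=10)
    top_results = rerank_results(query, results, top_k=3)
    if not top_results:
        return {"answer": "I don't know", "confidence": 0.0, "sources": {}}
    context, source_map = prepare_context(top_results)
    prompt_data = construct_prompt(query, context, source_map)
    answer = generate_answer(prompt_data, stream=False)
    is_valid, confidence = validate_citations(answer, source_map)
    return {"answer": answer,
            "confidence": confidence if is_valid else 0.0,
            "sources": source_map}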
The pipeline relies on several performance optimizations:
- Frequently asked questions are cached to eliminate redundant processing (see the sketch after this list).
- Vector embeddings are cached under SHA-256 content hashes, as implemented in generate_embeddings above.
- The MiniLM cross-encoder model is kept in memory for fast reranking inference.
- LLM response tokens are streamed to the client in real time.
- Citation tokens are resolved on the front end.
- Hybrid search parameters (alpha, ef, efConstruction, maxConnections) are tuned for the collection.
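A minimal sketch of the FAQ answer cache, assuming the same Redis client used for embeddings; the key format and TTL shown here are assumptions, not taken from the project.

# Assumed FAQ/answer cache keyed by a hash of the normalized query;
# key prefix and TTL are illustrative only
def get_cached_answer(query):
    key = "faq:" + hashlib.sha256(preprocess_query(query).encode()).hexdigest()
    cached = redis_client.get(key)
    return json.loads(cached) if cached else None

def cache_answer(query, answer_payload, ttl_seconds=60 * 60 * 24):
    key = "faq:" + hashlib.sha256(preprocess_query(query).encode()).hexdigest()
    redis_client.setex(key, ttl_seconds, json.dumps(answer_payload))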
To optimize cost and performance, the system implements intelligent model routing based on query complexity:
def analyze_query_complexity(query, search_results):
    """
    Analyze query complexity to determine appropriate LLM
    """
    # Simple properties
    token_count = len(query.split())
    query_length = len(query)
    # Check for technical terms
    technical_terms_count = sum(1 for term in query.split() if term.lower() in TECHNICAL_TERMS)
    technical_density = technical_terms_count / max(1, token_count)
    # Check number of intents
    intents = identify_intents(query)
    multi_intent = len(intents) > 1
    # Check confidence of top result
    top_confidence = search_results[0]["relevance_score"] if search_results else 0
    # Determine if this needs the more powerful model
    needs_powerful_model = (
        multi_intent or
        technical_density > 0.3 or
        token_count > 250 or
        top_confidence < 0.8
    )
    # Select appropriate model
    model = "gpt-4o" if needs_powerful_model else "gpt-3.5-turbo"
    return {
        "model": model,
        "complexity_score": 0.2 * multi_intent + 0.5 * technical_density +
                            0.1 * min(1, token_count / 300) + 0.2 * (1 - top_confidence)
    }
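TECHNICAL_TERMS and identify_intents are not defined in this excerpt; the sketch below shows one plausible shape for them, offered only as an assumption to make the routing heuristics concrete.

# Hypothetical stand-ins for the undefined dependencies above; the real
# term list and intent detection logic are not shown in this document
TECHNICAL_TERMS = {"sgrna", "crispr", "rt-qpcr", "ivt", "electroporation"}

def identify_intents(query):
    # Very rough heuristic: each question mark or "and" clause counts as
    # a separate intent
    parts = re.split(r'\?|\band\b', query.lower())
    return [p.strip() for p in parts if p.strip()]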
Additional features and further performance optimizations are planned for future development cycles.