The RNA Lab Navigator is a private, retrieval-augmented assistant designed for the RNA-biology lab. This document provides a detailed technical overview of the system's architecture, focusing on the RAG (Retrieval Augmented Generation) pipeline implementation.
Key features include hybrid vector and keyword retrieval over the lab's own documents, cross-encoder reranking, citation-backed answer generation, aggressive caching, streaming responses, and cost-aware model routing.
Ingestion begins with chunking. Each document is split into overlapping windows of roughly 400 words with a 100-word overlap; fragments shorter than 50 words at the tail are dropped:

```python
def chunk_document(text, chunk_size=400, overlap=100):
    """
    Split document text into overlapping chunks.

    Args:
        text: Document text to chunk
        chunk_size: Target word count per chunk
        overlap: Word overlap between chunks

    Returns:
        List of text chunks
    """
    words = text.split()
    chunks = []

    for i in range(0, len(words), chunk_size - overlap):
        chunk = words[i:i + chunk_size]
        if len(chunk) < 50:  # Skip tiny chunks at the end
            continue
        chunks.append(" ".join(chunk))

    return chunks
```
Each chunk is embedded with OpenAI's text-embedding-ada-002 in batches of 20, and embeddings are cached in Redis under a SHA-256 hash of the chunk text with a 30-day TTL:

```python
import hashlib
import json

import openai  # API key configured elsewhere; redis_client is a module-level Redis connection

def generate_embeddings(chunks, use_cache=True):
    """
    Generate embeddings for text chunks with caching.

    Args:
        chunks: List of text chunks
        use_cache: Whether to use embedding cache

    Returns:
        List of embedding vectors
    """
    embeddings = []
    batch_size = 20

    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        batch_embeddings = []

        for chunk in batch:
            # Generate cache key from content
            chunk_hash = hashlib.sha256(chunk.encode()).hexdigest()

            if use_cache and redis_client.exists(f"emb:{chunk_hash}"):
                # Retrieve from cache
                embedding = json.loads(redis_client.get(f"emb:{chunk_hash}"))
            else:
                # Generate new embedding
                response = openai.Embedding.create(
                    input=chunk,
                    model="text-embedding-ada-002"
                )
                embedding = response['data'][0]['embedding']

                # Cache the embedding with 30-day TTL
                if use_cache:
                    redis_client.setex(
                        f"emb:{chunk_hash}",
                        60 * 60 * 24 * 30,  # 30 day TTL
                        json.dumps(embedding)
                    )

            batch_embeddings.append(embedding)

        embeddings.extend(batch_embeddings)

    return embeddings
```
Chunks and their vectors are stored in a Weaviate `Document` class. The built-in vectorizers are skipped because vectors are supplied directly, and the HNSW index uses cosine distance with filterable metadata fields for document type, author, year, category, and journal:

```python
class_obj = {
    "class": "Document",
    "vectorizer": "none",  # We provide vectors directly
    "vectorIndexConfig": {
        "distance": "cosine",
        "ef": 256,
        "efConstruction": 256,
        "maxConnections": 64,
    },
    "properties": [
        {"name": "content", "dataType": ["text"]},
        {"name": "doc_type", "dataType": ["text"], "indexFilterable": True},
        {"name": "source_file", "dataType": ["text"], "indexFilterable": True},
        {"name": "chunk_index", "dataType": ["int"]},
        {"name": "author", "dataType": ["text"], "indexFilterable": True},
        {"name": "year", "dataType": ["int"], "indexFilterable": True},
        {"name": "chapter", "dataType": ["text"]},
        {"name": "department", "dataType": ["text"]},
        {"name": "category", "dataType": ["text"], "indexFilterable": True},
        {"name": "last_updated", "dataType": ["date"]},
        {"name": "reagents", "dataType": ["text[]"]},
        {"name": "authors", "dataType": ["text[]"]},
        {"name": "publication_date", "dataType": ["date"]},
        {"name": "journal", "dataType": ["text"], "indexFilterable": True},
        {"name": "doi", "dataType": ["text"]},
    ],
    "moduleConfig": {
        "text2vec-contextionary": {
            "skip": True  # Skip built-in vectorization
        },
        "text2vec-transformers": {
            "skip": True  # Skip built-in vectorization
        }
    }
}
```
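To connect these pieces, the sketch below shows one way chunked, embedded documents could be pushed into this schema with the v3 Weaviate Python client's batch API. The ingestion loop, the local endpoint, and the metadata it fills are illustrative assumptions, not code from the source:

```python
# Illustrative ingestion sketch (assumed glue code, not from the source).
import weaviate

weaviate_client = weaviate.Client("http://localhost:8080")  # assumed local instance

# Create the Document class on first run
existing = [c["class"] for c in weaviate_client.schema.get().get("classes", [])]
if "Document" not in existing:
    weaviate_client.schema.create_class(class_obj)

def ingest_document(text, source_file, doc_type="protocol"):
    """Chunk a document, embed the chunks, and batch-import them with their vectors."""
    chunks = chunk_document(text)
    vectors = generate_embeddings(chunks)

    weaviate_client.batch.configure(batch_size=50)
    with weaviate_client.batch as batch:
        for idx, (chunk, vector) in enumerate(zip(chunks, vectors)):
            batch.add_data_object(
                data_object={
                    "content": chunk,
                    "source_file": source_file,
                    "doc_type": doc_type,
                    "chunk_index": idx,
                },
                class_name="Document",
                vector=vector,
            )
```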
On the query side, incoming questions are first normalized and then expanded with domain synonyms:

```python
import re

def preprocess_query(query):
    """
    Clean and normalize the query text
    """
    # Remove special characters
    query = re.sub(r'[^\w\s]', ' ', query)
    # Normalize whitespace
    query = re.sub(r'\s+', ' ', query).strip()
    return query

def expand_query(query):
    """
    Expand query with related terms
    """
    # Extract key entities
    entities = extract_entities(query)

    # Add related terms based on domain knowledge
    expanded_terms = []
    for entity in entities:
        if entity in domain_knowledge:
            expanded_terms.extend(domain_knowledge[entity]['synonyms'])

    # Combine original query with expanded terms
    expanded_query = query
    if expanded_terms:
        expanded_query = f"{query} {' '.join(expanded_terms)}"

    return expanded_query
```
Retrieval is a Weaviate hybrid search that combines the query embedding with BM25 keyword matching (alpha = 0.75 weights the vector side) and applies optional metadata filters:

```python
def hybrid_search(query, filters=None, limit=10):
    """
    Perform hybrid vector + keyword search
    """
    # Generate embedding for query
    query_embedding = generate_embedding(query)

    # Build the hybrid query: the embedding is passed alongside the raw query text,
    # with alpha weighting vector search against keyword (BM25) search
    query_builder = (
        weaviate_client.query
        .get(
            "Document",
            ["content", "doc_type", "source_file", "chunk_index",
             "author", "year", "chapter", "category"]
        )
        .with_additional(["distance", "score"])
        .with_hybrid(query=query, vector=query_embedding, alpha=0.75)
        .with_limit(limit)
    )

    # Add metadata filters if specified
    if filters:
        query_builder = query_builder.with_where(filters)

    # Execute search
    results = query_builder.do()
    return results["data"]["Get"]["Document"]
```
Candidates are then reranked with a cross-encoder; results scoring below a 0.45 relevance threshold are discarded and the top three are kept:

```python
def rerank_results(query, results, top_k=3):
    """
    Rerank search results using cross-encoder
    """
    # If we have no results, return empty list
    if not results:
        return []

    # Prepare (query, passage) pairs for the cross-encoder
    pairs = [(query, result["content"]) for result in results]

    # Get relevance scores
    relevance_scores = cross_encoder.predict(pairs)

    # Add scores to results
    for i, result in enumerate(results):
        result["relevance_score"] = float(relevance_scores[i])

    # Sort by relevance score
    reranked_results = sorted(results, key=lambda x: x["relevance_score"], reverse=True)

    # Apply confidence threshold
    filtered_results = [r for r in reranked_results if r["relevance_score"] >= 0.45]

    # Return top k results
    return filtered_results[:top_k]
```
The surviving chunks are assembled into an LLM-ready context, each followed by a numbered citation token, while a source map records the metadata behind every citation:

```python
def prepare_context(results):
    """
    Format search results into LLM-ready context with citations
    """
    context_parts = []
    source_map = {}

    for i, result in enumerate(results):
        # Create citation token
        citation = f"[{i+1}]"
        source_id = f"source_{i+1}"

        # Add to source map for later reference
        source_map[source_id] = {
            "doc_type": result.get("doc_type", ""),
            "title": result.get("source_file", "").split("/")[-1],
            "author": result.get("author", ""),
            "year": result.get("year", ""),
            "chapter": result.get("chapter", ""),
            "score": result.get("relevance_score", 0)
        }

        # Format the context with citation
        context_part = f"{result['content']} {citation}"
        context_parts.append(context_part)

    # Combine all contexts
    context = "\n\n".join(context_parts)

    return context, source_map
```
The prompt pairs a strict system message (answer only from the provided sources, cite every factual statement, otherwise say "I don't know") with the context, the source map, and the question:

```python
def construct_prompt(query, context, source_map):
    """
    Construct the LLM prompt with system message, query, and context
    """
    system_message = """
    You are RNA Lab Navigator, a specialized assistant for an RNA biology research lab.
    Answer only from the provided sources; if unsure, say 'I don't know.'

    Important rules:
    1. Include citations for all factual statements using the [X] format
    2. Citations must appear at the end of the sentence containing the information
    3. Only reference information explicitly stated in the provided context
    4. Maintain scientific accuracy and precision
    5. If multiple sources confirm the same information, cite all of them
    6. If the query cannot be answered from the provided context, say 'I don't know'
    7. Never make up information or citations
    """

    # Construct the full prompt
    prompt = f"""
    Context:
    {context}

    Sources:
    {json.dumps(source_map, indent=2)}

    Question: {query}
    """

    return {
        "system": system_message,
        "prompt": prompt
    }
```
Answers are generated with GPT-4o at low temperature. In streaming mode the function returns a generator of content chunks for relay to the client; afterwards, every citation in the answer is validated against the source map and an overall confidence score is computed:

```python
def generate_answer(prompt_data, stream=True):
    """
    Generate answer from LLM using the constructed prompt.
    Returns a generator of content chunks when streaming, otherwise the full answer text.
    """
    response = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": prompt_data["system"]},
            {"role": "user", "content": prompt_data["prompt"]}
        ],
        temperature=0.2,
        max_tokens=1000,
        stream=stream
    )

    if stream:
        def stream_content():
            # Yield each content delta as it arrives for streaming to the client
            for chunk in response:
                delta = chunk.choices[0].delta.get("content", "")
                if delta:
                    yield delta
        return stream_content()

    # Non-streaming: return the full response directly
    return response.choices[0].message.content

def validate_citations(answer, source_map):
    """
    Validate all citations in the answer
    """
    citation_pattern = r'\[(\d+)\]'
    citations = re.findall(citation_pattern, answer)

    valid_citations = []
    for citation in citations:
        source_id = f"source_{citation}"
        if source_id in source_map:
            valid_citations.append(source_id)

    # If no valid citations found, mark low confidence
    if not valid_citations:
        return False, 0.0

    # Calculate overall confidence based on source relevance scores
    confidence = sum(source_map[source_id]["score"] for source_id in valid_citations) / len(valid_citations)

    return True, confidence
```
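For orientation, here is a hedged end-to-end sketch of how the functions above could be wired together for a single question. The orchestration itself is an assumption drawn from the pipeline order, not code from the source:

```python
def answer_question(raw_query):
    """Illustrative orchestration of the pipeline described above (assumed glue code)."""
    query = expand_query(preprocess_query(raw_query))

    hits = hybrid_search(query, limit=10)
    top_hits = rerank_results(query, hits, top_k=3)

    context, source_map = prepare_context(top_hits)
    prompt_data = construct_prompt(raw_query, context, source_map)

    # Collect the streamed tokens into the final answer text
    answer = "".join(generate_answer(prompt_data, stream=True))

    is_cited, confidence = validate_citations(answer, source_map)
    return {
        "answer": answer,
        "sources": source_map,
        "citations_valid": is_cited,
        "confidence": confidence,
    }
```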
Two layers of caching keep latency and cost down. Frequently asked questions are cached so repeated queries skip retrieval and generation entirely, and vector embeddings are cached under SHA-256 content hashes (the emb:<hash> Redis keys shown in generate_embeddings above); a sketch of the question-level cache follows.
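The sketch below shows one way such a question-level cache could work, keyed on a hash of the normalized query; the key format, TTL, and cached fields are assumptions for illustration:

```python
import hashlib
import json
import time

def get_cached_answer(query):
    """Return a previously generated answer for this normalized query, if any (assumed key format)."""
    key = "qa:" + hashlib.sha256(preprocess_query(query).lower().encode()).hexdigest()
    cached = redis_client.get(key)
    return json.loads(cached) if cached else None

def cache_answer(query, answer, source_map, ttl=60 * 60 * 24):
    """Store the generated answer and its sources for reuse (fields and TTL are assumed)."""
    key = "qa:" + hashlib.sha256(preprocess_query(query).lower().encode()).hexdigest()
    redis_client.setex(key, ttl, json.dumps({
        "answer": answer,
        "sources": source_map,
        "created_at": time.time(),
    }))
```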
The MiniLM cross-encoder used for reranking is loaded once and kept in memory, so each request pays only inference latency rather than model-load time; a sketch of the warm-start singleton follows.
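A minimal sketch of keeping the reranker warm, assuming the sentence-transformers CrossEncoder class; the exact checkpoint name is an assumption:

```python
from sentence_transformers import CrossEncoder

# Loaded once at import time and reused across requests,
# so reranking only pays inference cost (checkpoint name is an assumption).
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def score_pairs(query, passages):
    """Score (query, passage) pairs with the warm, in-memory cross-encoder."""
    return cross_encoder.predict([(query, p) for p in passages])
```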
LLM response tokens are streamed to the client in real time (see generate_answer above), and the front end rewrites citation tokens such as [1] into links to the matching entries in the source map as the text arrives; a sketch of the server-side streaming wrapper follows.
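As a sketch of the transport side, streamed tokens could be wrapped as Server-Sent Events for the browser; the SSE framing here is an assumption about the delivery mechanism, which the source does not specify:

```python
import json

def sse_token_stream(prompt_data):
    """Wrap the generate_answer token stream as Server-Sent Events (assumed transport)."""
    for token in generate_answer(prompt_data, stream=True):
        yield f"data: {json.dumps({'token': token})}\n\n"
    yield "data: [DONE]\n\n"
```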
Hybrid search parameters are tuned for the lab's corpus: the alpha = 0.75 vector/keyword weighting in hybrid_search and the HNSW settings (ef, efConstruction, maxConnections) in the Weaviate schema above.
To optimize cost and performance, the system implements intelligent model routing based on query complexity:
```python
def analyze_query_complexity(query, search_results):
    """
    Analyze query complexity to determine appropriate LLM
    """
    # Simple properties
    token_count = len(query.split())
    query_length = len(query)

    # Check for technical terms
    technical_terms_count = sum(1 for term in query.split() if term.lower() in TECHNICAL_TERMS)
    technical_density = technical_terms_count / max(1, token_count)

    # Check number of intents
    intents = identify_intents(query)
    multi_intent = len(intents) > 1

    # Check confidence of top result
    top_confidence = search_results[0]["relevance_score"] if search_results else 0

    # Determine if this needs the more powerful model
    needs_powerful_model = (
        multi_intent or
        technical_density > 0.3 or
        token_count > 250 or
        top_confidence < 0.8
    )

    # Select appropriate model
    model = "gpt-4o" if needs_powerful_model else "gpt-3.5-turbo"

    return {
        "model": model,
        "complexity_score": 0.2 * multi_intent + 0.5 * technical_density +
                            0.1 * min(1, token_count / 300) + 0.2 * (1 - top_confidence)
    }
```
Additional features and further performance optimizations are planned for future development cycles.