The document ingestion pipeline is a critical component of the RNA Lab Navigator, responsible for processing various types of documents (protocols, theses, papers) and preparing them for efficient retrieval. This document provides a detailed technical overview of the ingestion process, focusing on the specialized handling of different document types and optimization strategies.
The ingestion pipeline applies specialized handling to each document type. For text extraction, it uses a multi-tool approach with fallback mechanisms, selecting the tool automatically based on an analysis of the PDF's structure:
```python
import fitz  # PyMuPDF


def extract_text(pdf_path):
    """
    Extract text from a PDF using the most appropriate tool.
    The extract_with_* helpers are defined elsewhere in this module.
    """
    # Analyze the PDF to determine the best extraction method
    extraction_method = analyze_pdf_structure(pdf_path)

    if extraction_method == "pdfplumber":
        return extract_with_pdfplumber(pdf_path)
    elif extraction_method == "pymupdf":
        return extract_with_pymupdf(pdf_path)
    elif extraction_method == "ocr":
        return extract_with_ocr(pdf_path)
    else:
        # Fall back to a combined approach
        return extract_combined(pdf_path)


def analyze_pdf_structure(pdf_path):
    """
    Determine the appropriate extraction method
    based on PDF structure analysis.
    """
    with fitz.open(pdf_path) as doc:
        # Check whether the document has selectable text
        has_text = False
        for page in doc:
            if page.get_text():
                has_text = True
                break

        # No selectable text means a scanned document: use OCR
        if not has_text:
            return "ocr"

        # Check for complex layouts (tables, multi-column)
        sample_page = doc[0]
        blocks = sample_page.get_text("blocks")
        has_complex_layout = len(blocks) > 10  # Heuristic for complex layout

        # pdfplumber handles complex layouts better; PyMuPDF is faster otherwise
        if has_complex_layout:
            return "pdfplumber"
        else:
            return "pymupdf"
```
The chunking system balances chunk size, overlap, and document structure. Chunks target roughly 400 words with 100 words of overlap, and type-specific logic preserves chapter boundaries in theses and section boundaries in protocols:
```python
import re


def chunk_document(text, doc_type, metadata=None):
    """
    Chunk a document based on its type and content.
    create_word_chunks and split_by_protocol_sections are helpers
    defined elsewhere in this module.
    """
    if doc_type == "thesis":
        # For theses, first split by chapter headings. re.split with one
        # capture group interleaves chapter titles (odd indices) with the
        # text that follows each heading (even indices).
        chapter_pattern = r'CHAPTER\s+\d+[\s\-:]*([^\n]+)'
        chapters = re.split(chapter_pattern, text)

        chunks = []
        for i, chapter in enumerate(chapters):
            if i % 2 == 1:  # Chapter titles are at odd indices
                chapter_title = chapter.strip()
                chapter_content = chapters[i + 1] if i + 1 < len(chapters) else ""

                # Within each chapter, create overlapping word chunks
                chapter_chunks = create_word_chunks(
                    chapter_content,
                    chunk_size=400,
                    overlap=100
                )

                # Attach chapter metadata to each chunk
                for j, chunk in enumerate(chapter_chunks):
                    chunk_metadata = metadata.copy() if metadata else {}
                    chunk_metadata.update({
                        "chapter": chapter_title,
                        "chunk_index": j,
                        "chapter_index": i // 2
                    })
                    chunks.append({
                        "content": chunk,
                        "metadata": chunk_metadata
                    })
        return chunks

    elif doc_type == "protocol":
        # For protocols, preserve reagent sections
        sections = split_by_protocol_sections(text)

        chunks = []
        for section_name, section_content in sections:
            # Create word chunks within each section
            section_chunks = create_word_chunks(
                section_content,
                chunk_size=400,
                overlap=100
            )

            # Attach section metadata to each chunk
            for j, chunk in enumerate(section_chunks):
                chunk_metadata = metadata.copy() if metadata else {}
                chunk_metadata.update({
                    "section": section_name,
                    "chunk_index": j
                })
                chunks.append({
                    "content": chunk,
                    "metadata": chunk_metadata
                })
        return chunks

    else:  # Default for papers and other document types
        standard_chunks = create_word_chunks(
            text,
            chunk_size=400,
            overlap=100
        )
        return [
            {
                "content": chunk,
                "metadata": {
                    **(metadata or {}),
                    "chunk_index": i
                }
            }
            for i, chunk in enumerate(standard_chunks)
        ]
```
The metadata extraction module employs a combination of rule-based and ML approaches; the rule-based layer is sketched below.
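As a rough illustration of the rule-based layer, the sketch below pulls a DOI and publication year from a paper's first page with regular expressions. The patterns and the `extract_basic_metadata` name are illustrative assumptions, not the module's actual implementation:

```python
import re

# Illustrative patterns only; the production module combines many
# such rules with ML-based fallbacks for ambiguous cases.
DOI_PATTERN = re.compile(r'\b10\.\d{4,9}/[-._;()/:A-Za-z0-9]+')
YEAR_PATTERN = re.compile(r'\b(?:19|20)\d{2}\b')


def extract_basic_metadata(first_page_text):
    """Hypothetical rule-based pass over a paper's first page."""
    metadata = {}

    doi_match = DOI_PATTERN.search(first_page_text)
    if doi_match:
        metadata["doi"] = doi_match.group(0)

    year_match = YEAR_PATTERN.search(first_page_text)
    if year_match:
        metadata["year"] = int(year_match.group(0))

    return metadata
```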
The embedding generation system prioritizes quality and efficiency. Optimizations include request batching, SHA-256-keyed Redis caching with a 30-day TTL, and a local SentenceTransformer fallback when the OpenAI API is unavailable:
```python
import hashlib
import json
import logging

import openai
from sentence_transformers import SentenceTransformer

logger = logging.getLogger(__name__)


def generate_embedding_batch(chunks, use_cache=True):
    """
    Generate embeddings for a batch of chunks, with Redis caching
    and a local-model fallback. redis_client is a module-level
    Redis connection.
    """
    cache_hits = []
    chunks_to_embed = []
    hashes_to_embed = []

    # Check the cache for existing embeddings, keyed by content hash
    for chunk in chunks:
        chunk_text = chunk["content"]
        chunk_hash = hashlib.sha256(chunk_text.encode()).hexdigest()

        if use_cache and redis_client.exists(f"emb:{chunk_hash}"):
            # Retrieve from cache
            embedding = json.loads(redis_client.get(f"emb:{chunk_hash}"))
            cache_hits.append({
                "chunk": chunk,
                "embedding": embedding,
                "source": "cache"
            })
        else:
            # Keep each hash aligned with its chunk so the cache key is
            # correct when the new embedding is stored below (indexing into
            # a combined hash list breaks when hits and misses interleave)
            chunks_to_embed.append(chunk)
            hashes_to_embed.append(chunk_hash)

    # If every chunk was cached, return early
    if not chunks_to_embed:
        return cache_hits

    # Generate embeddings for the remaining chunks
    try:
        response = openai.Embedding.create(
            input=[c["content"] for c in chunks_to_embed],
            model="text-embedding-ada-002"
        )

        # Process the response and update the cache
        api_results = []
        for i, chunk in enumerate(chunks_to_embed):
            embedding = response["data"][i]["embedding"]
            chunk_hash = hashes_to_embed[i]

            # Cache the embedding
            if use_cache:
                redis_client.setex(
                    f"emb:{chunk_hash}",
                    60 * 60 * 24 * 30,  # 30-day TTL
                    json.dumps(embedding)
                )

            api_results.append({
                "chunk": chunk,
                "embedding": embedding,
                "source": "api"
            })

        # Combine cache hits and API results
        return cache_hits + api_results

    except Exception as e:
        # Fall back to a local model on API failure
        logger.warning(f"OpenAI API failed, falling back to local model: {e}")

        local_model = SentenceTransformer("all-MiniLM-L6-v2")
        embeddings = local_model.encode(
            [c["content"] for c in chunks_to_embed],
            batch_size=32,
            show_progress_bar=False
        )

        fallback_results = []
        for i, chunk in enumerate(chunks_to_embed):
            fallback_results.append({
                "chunk": chunk,
                "embedding": embeddings[i].tolist(),
                "source": "local_model"
            })
        return cache_hits + fallback_results
```
The Weaviate configuration is optimized for the RNA biology domain. The schema disables built-in vectorization (vectors are supplied by the pipeline), uses a cosine-distance HNSW index, and defines filterable properties for theses, protocols, and papers:
```python
class_obj = {
    "class": "Document",
    "vectorizer": "none",  # Vectors are supplied externally
    "vectorIndexConfig": {
        "distance": "cosine",
        "ef": 256,
        "efConstruction": 256,
        "maxConnections": 64,
    },
    "moduleConfig": {
        "text2vec-contextionary": {
            "skip": True  # Skip built-in vectorization
        },
        "text2vec-transformers": {
            "skip": True  # Skip built-in vectorization
        }
    },
    "properties": [
        {"name": "content", "dataType": ["text"]},
        {"name": "doc_type", "dataType": ["text"], "indexFilterable": True},
        {"name": "source_file", "dataType": ["text"], "indexFilterable": True},
        {"name": "chunk_index", "dataType": ["int"]},
        {"name": "word_count", "dataType": ["int"]},
        {"name": "ingestion_date", "dataType": ["date"]},
        # Thesis-specific properties
        {"name": "author", "dataType": ["text"], "indexFilterable": True},
        {"name": "year", "dataType": ["int"], "indexFilterable": True},
        {"name": "department", "dataType": ["text"], "indexFilterable": True},
        {"name": "institution", "dataType": ["text"], "indexFilterable": True},
        {"name": "advisor", "dataType": ["text"]},
        {"name": "chapter", "dataType": ["text"], "indexFilterable": True},
        # Protocol-specific properties
        {"name": "version", "dataType": ["text"]},
        {"name": "creation_date", "dataType": ["date"]},
        {"name": "update_date", "dataType": ["date"]},
        {"name": "category", "dataType": ["text"], "indexFilterable": True},
        {"name": "reagents", "dataType": ["text[]"]},
        # Paper-specific properties
        {"name": "authors", "dataType": ["text[]"]},
        {"name": "journal", "dataType": ["text"], "indexFilterable": True},
        {"name": "publication_date", "dataType": ["date"]},
        {"name": "doi", "dataType": ["text"]},
        {"name": "keywords", "dataType": ["text[]"], "indexFilterable": True},
        # Cross-references: Weaviate references name the target class
        {"name": "references", "dataType": ["Document"]}
    ]
}
```
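Registering the class is then a single call; a minimal sketch, assuming a weaviate-client v3 connection to a local instance (the URL is illustrative):

```python
import weaviate

# Assumes a local Weaviate instance; the URL is illustrative
client = weaviate.Client("http://localhost:8080")

# Register the Document class defined above
client.schema.create_class(class_obj)
```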
Storage requirements: ~2 KB of metadata per chunk + ~12 KB per vector embedding (1,536 dimensions at 8 bytes each) = ~14 KB per chunk in total.
Several enhancements are planned. The first is automated extraction and embedding of figures from documents, which will enable the system to include relevant figures in its responses; a sketch of the extraction step follows.
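As an illustration of what the extraction step could look like (a sketch only, not the final design; the function name and output layout are assumptions), PyMuPDF can enumerate and export each page's embedded images:

```python
import fitz  # PyMuPDF


def extract_figures(pdf_path, output_dir):
    """Sketch: export embedded images from a PDF for later embedding."""
    figures = []
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc):
            # full=True yields complete image-info tuples; the xref is item [0]
            for image_index, info in enumerate(page.get_images(full=True)):
                image = doc.extract_image(info[0])  # raw bytes plus extension
                path = f"{output_dir}/p{page_number}_{image_index}.{image['ext']}"
                with open(path, "wb") as f:
                    f.write(image["image"])
                figures.append({"page": page_number, "path": path})
    return figures
```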
Second, specialized extraction of reagent information from protocols will enable precise reagent lookup and inventory integration; a rough sketch of a rule-based parser appears below.
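A rule-based parser might start from patterns like the one below; the regex and the `extract_reagents` name are illustrative assumptions:

```python
import re

# Illustrative pattern: quantity, unit, then reagent name,
# e.g. "10 µL T4 DNA ligase" or "0.5 mg/mL BSA"
REAGENT_PATTERN = re.compile(
    r'(?P<quantity>\d+(?:\.\d+)?)\s*'
    r'(?P<unit>µ?[A-Za-z]{1,2}(?:/m[Ll])?)\s+'
    r'(?P<name>[A-Za-z][\w\- ]+)'
)


def extract_reagents(section_text):
    """Sketch: pull quantity/unit/name triples from a reagents section."""
    return [
        {
            "quantity": float(m.group("quantity")),
            "unit": m.group("unit"),
            "name": m.group("name").strip()
        }
        for m in REAGENT_PATTERN.finditer(section_text)
    ]
```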
Third, enhanced preservation of hierarchical document structure will improve navigation within large documents such as theses; see the sketch below.
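When the source PDF carries bookmarks, PyMuPDF's table-of-contents API is one possible building block; a minimal sketch:

```python
import fitz  # PyMuPDF


def extract_outline(pdf_path):
    """Sketch: capture a PDF's bookmark hierarchy for navigation metadata."""
    with fitz.open(pdf_path) as doc:
        # get_toc() returns [level, title, page] entries
        return [
            {"level": level, "title": title, "page": page}
            for level, title, page in doc.get_toc()
        ]
```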
Finally, efficient handling of document updates will optimize processing of frequently revised protocols; the sketch below shows hash-based change detection.
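Since chunks are already keyed by SHA-256 content hashes for the embedding cache, the same hashes can drive incremental re-ingestion. A minimal sketch (the `changed_chunks` helper is an assumption, not existing code):

```python
import hashlib


def changed_chunks(old_chunks, new_chunks):
    """Sketch: select only chunks whose content hash is new."""
    old_hashes = {
        hashlib.sha256(c["content"].encode()).hexdigest()
        for c in old_chunks
    }
    return [
        c for c in new_chunks
        if hashlib.sha256(c["content"].encode()).hexdigest() not in old_hashes
    ]
```

Only the returned chunks need re-embedding and re-indexing; unchanged chunks keep their cached vectors.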