Testing and Evaluation - Validate LangChain Applications
Master comprehensive testing strategies for LangChain applications including unit tests, integration tests, performance benchmarks, and quality evaluation metrics
🧪 Testing Strategy Overview
Testing LangChain applications requires specialized approaches due to their non-deterministic nature, external API dependencies, and complex reasoning patterns. This guide covers comprehensive testing methodologies for production systems.
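Because live model calls are non-deterministic, every layer below pins the model with a mock or a fake. As a quick orientation, here is a minimal deterministic chain test; it is a sketch that assumes a recent langchain_core release, which ships FakeListChatModel for exactly this purpose.
python
# A minimal, fully deterministic chain test using langchain_core's fake chat model.
from langchain_core.language_models import FakeListChatModel
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate


def test_chain_is_deterministic():
    prompt = ChatPromptTemplate.from_template("Summarize: {text}")
    fake_model = FakeListChatModel(responses=["A fixed summary."])  # no network, no randomness
    chain = prompt | fake_model | StrOutputParser()

    assert chain.invoke({"text": "LangChain testing"}) == "A fixed summary."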
🎯 Testing Pyramid for LangChain
text
                    🧪 LANGCHAIN TESTING PYRAMID 🧪
                      (Comprehensive test strategy)

 ──────────────────────────────────────────────────────────────────
  E2E & USER TESTS
    • User Journey Tests     • A/B Testing       • Canary Releases
    • Performance Tests      • Load Testing      • Chaos Engineering
 ──────────────────────────────────────────────────────────────────
  INTEGRATION TESTS
    • Chain Testing          • Agent Testing     • RAG Testing
    • API Integration        • Memory Testing    • Tool Integration
 ──────────────────────────────────────────────────────────────────
  UNIT TESTS
    • Prompt Testing         • Output Parsing    • Component Tests
    • Mock LLM Tests         • Utility Tests     • Error Handling
 ──────────────────────────────────────────────────────────────────
  QUALITY EVALUATION
    • Semantic Similarity        • Factual Accuracy   • Response Quality
    • Hallucination Detection    • Bias Testing       • Safety Checks
 ──────────────────────────────────────────────────────────────────
🔧 Unit Testing Framework
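Before the component tests, a shared conftest.py keeps the pyramid layers selectable from the command line. The following is a minimal sketch, not project-standard configuration: the marker names and the --run-e2e flag are illustrative assumptions.
python
# conftest.py — a sketch of layer selection for the testing pyramid.
import pytest


def pytest_addoption(parser):
    # Hypothetical flag: end-to-end tests only run when explicitly requested.
    parser.addoption("--run-e2e", action="store_true", default=False,
                     help="run end-to-end tests against a live environment")


def pytest_configure(config):
    config.addinivalue_line("markers", "unit: fast, fully mocked component tests")
    config.addinivalue_line("markers", "integration: tests that call running services")
    config.addinivalue_line("markers", "e2e: full user-journey tests")


def pytest_collection_modifyitems(config, items):
    if config.getoption("--run-e2e"):
        return
    skip_e2e = pytest.mark.skip(reason="needs --run-e2e")
    for item in items:
        if "e2e" in item.keywords:
            item.add_marker(skip_e2e)
With this in place, `pytest -m unit` runs only the fast layer, while `pytest --run-e2e -m e2e` exercises the top of the pyramid.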
🎯 LangChain-Specific Unit Tests
python
import pytest
from unittest.mock import Mock, patch, MagicMock
from typing import List, Dict, Any, Optional
import json
import asyncio
from datetime import datetime
# LangChain imports
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser, PydanticOutputParser
from langchain_core.messages import HumanMessage, AIMessage
from langchain.chains import LLMChain
from langchain.memory import ConversationBufferMemory
from langchain_core.documents import Document
# Testing utilities
class MockLLM:
"""Mock LLM for testing"""
def __init__(self, responses: List[str] = None):
self.responses = responses or ["Mock response"]
self.call_count = 0
self.call_history = []
    def invoke(self, input_text) -> str:
        """Mock invoke method (accepts plain strings or prompt values)"""
        # When used in an LCEL chain, the input is a PromptValue; normalize it
        # to a string so tests can assert on the rendered prompt text.
        text = input_text.to_string() if hasattr(input_text, "to_string") else str(input_text)
        self.call_history.append(text)
        response = self.responses[self.call_count % len(self.responses)]
        self.call_count += 1
        return response
async def ainvoke(self, input_text: str) -> str:
"""Mock async invoke method"""
return self.invoke(input_text)
def __call__(self, input_text: str) -> str:
return self.invoke(input_text)
class MockVectorStore:
"""Mock vector store for testing"""
def __init__(self, documents: List[Document] = None):
self.documents = documents or []
self.search_history = []
def similarity_search(self, query: str, k: int = 4) -> List[Document]:
"""Mock similarity search"""
self.search_history.append({"query": query, "k": k})
return self.documents[:k]
def similarity_search_with_score(self, query: str, k: int = 4) -> List[tuple]:
"""Mock similarity search with scores"""
docs = self.similarity_search(query, k)
return [(doc, 0.8 - i * 0.1) for i, doc in enumerate(docs)]
class LangChainTestSuite:
"""Comprehensive test suite for LangChain components"""
@pytest.fixture
def mock_llm(self):
"""Fixture for mock LLM"""
return MockLLM([
"This is a test response",
"Another test response",
"Final test response"
])
@pytest.fixture
def sample_documents(self):
"""Fixture for sample documents"""
return [
Document(page_content="Python is a programming language", metadata={"source": "doc1"}),
Document(page_content="Machine learning uses algorithms", metadata={"source": "doc2"}),
Document(page_content="LangChain is a framework for LLMs", metadata={"source": "doc3"})
]
@pytest.fixture
def mock_vectorstore(self, sample_documents):
"""Fixture for mock vector store"""
return MockVectorStore(sample_documents)
# Test cases for prompts
class TestPrompts:
"""Test prompt templates and formatting"""
def test_simple_prompt_template(self):
"""Test basic prompt template functionality"""
template = "Tell me about {topic}"
prompt = ChatPromptTemplate.from_template(template)
formatted = prompt.format(topic="Python")
assert "Python" in formatted
assert isinstance(formatted, str)
def test_complex_prompt_template(self):
"""Test complex prompt with multiple variables"""
template = """
Context: {context}
Question: {question}
Instructions: {instructions}
Answer:"""
prompt = ChatPromptTemplate.from_template(template)
formatted = prompt.format(
context="Python is a programming language",
question="What is Python?",
instructions="Be concise"
)
assert "Python is a programming language" in formatted
assert "What is Python?" in formatted
assert "Be concise" in formatted
def test_prompt_validation(self):
"""Test prompt input validation"""
template = "Tell me about {topic}"
prompt = ChatPromptTemplate.from_template(template)
# Test missing variable
with pytest.raises(KeyError):
prompt.format()
# Test extra variables (should work)
formatted = prompt.format(topic="Python", extra="ignored")
assert "Python" in formatted
def test_few_shot_prompt(self):
"""Test few-shot prompt construction"""
examples = [
{"input": "What is 2+2?", "output": "4"},
{"input": "What is 3+3?", "output": "6"}
]
template = """
Examples:
{examples}
Input: {input}
Output:"""
examples_text = "\n".join([
f"Input: {ex['input']}\nOutput: {ex['output']}"
for ex in examples
])
prompt = ChatPromptTemplate.from_template(template)
formatted = prompt.format(examples=examples_text, input="What is 4+4?")
assert "What is 2+2?" in formatted
assert "What is 4+4?" in formatted
# Test cases for chains
class TestChains(LangChainTestSuite):
"""Test LangChain chains"""
def test_simple_chain(self, mock_llm):
"""Test basic chain execution"""
template = "Tell me about {topic}"
prompt = ChatPromptTemplate.from_template(template)
chain = prompt | mock_llm | StrOutputParser()
result = chain.invoke({"topic": "Python"})
assert result == "This is a test response"
assert mock_llm.call_count == 1
assert "Python" in mock_llm.call_history[0]
def test_chain_with_memory(self, mock_llm):
"""Test chain with conversation memory"""
memory = ConversationBufferMemory(return_messages=True)
template = """
Previous conversation:
{history}
Human: {input}
Assistant:"""
prompt = ChatPromptTemplate.from_template(template)
chain = prompt | mock_llm
# First interaction
result1 = chain.invoke({
"input": "Hello",
"history": memory.chat_memory.messages
})
# Add to memory
memory.chat_memory.add_user_message("Hello")
memory.chat_memory.add_ai_message(result1)
# Second interaction
result2 = chain.invoke({
"input": "How are you?",
"history": memory.chat_memory.messages
})
assert mock_llm.call_count == 2
assert len(memory.chat_memory.messages) == 2
@pytest.mark.asyncio
async def test_async_chain(self, mock_llm):
"""Test asynchronous chain execution"""
template = "Tell me about {topic}"
prompt = ChatPromptTemplate.from_template(template)
chain = prompt | mock_llm
result = await chain.ainvoke({"topic": "Python"})
assert result == "This is a test response"
assert mock_llm.call_count == 1
# Test cases for RAG systems
class TestRAG(LangChainTestSuite):
"""Test Retrieval-Augmented Generation systems"""
def test_basic_rag_retrieval(self, mock_vectorstore):
"""Test basic RAG document retrieval"""
query = "What is Python?"
docs = mock_vectorstore.similarity_search(query, k=2)
assert len(docs) == 2
assert any("Python" in doc.page_content for doc in docs)
assert len(mock_vectorstore.search_history) == 1
assert mock_vectorstore.search_history[0]["query"] == query
def test_rag_with_scoring(self, mock_vectorstore):
"""Test RAG with similarity scores"""
query = "What is machine learning?"
results = mock_vectorstore.similarity_search_with_score(query, k=3)
assert len(results) == 3
for doc, score in results:
assert isinstance(doc, Document)
assert 0.0 <= score <= 1.0
def test_rag_context_formatting(self, sample_documents):
"""Test RAG context formatting"""
def format_context(docs: List[Document]) -> str:
return "\n\n".join([
f"Source {i+1}: {doc.page_content}"
for i, doc in enumerate(docs)
])
context = format_context(sample_documents[:2])
assert "Source 1:" in context
assert "Source 2:" in context
assert "Python is a programming language" in context
assert "Machine learning uses algorithms" in context
def test_rag_end_to_end(self, mock_llm, mock_vectorstore):
"""Test complete RAG pipeline"""
query = "What is Python?"
# Retrieve documents
docs = mock_vectorstore.similarity_search(query, k=2)
# Format context
context = "\n".join([doc.page_content for doc in docs])
# Create RAG prompt
template = """
Context: {context}
Question: {question}
Answer:"""
prompt = ChatPromptTemplate.from_template(template)
# Execute RAG chain
chain = prompt | mock_llm
result = chain.invoke({"context": context, "question": query})
assert result == "This is a test response"
assert mock_llm.call_count == 1
assert "Python" in mock_llm.call_history[0]
# Test cases for output parsing
class TestOutputParsing:
"""Test output parsing and validation"""
def test_string_parser(self):
"""Test basic string output parser"""
parser = StrOutputParser()
# Test with string input
result = parser.parse("Hello, world!")
assert result == "Hello, world!"
        # Test with an AI message (invoke extracts the message content)
        message = AIMessage(content="Hello from AI")
        result = parser.invoke(message)
        assert result == "Hello from AI"
def test_json_parsing(self):
"""Test JSON output parsing"""
json_output = '{"name": "John", "age": 30, "city": "New York"}'
parsed = json.loads(json_output)
assert parsed["name"] == "John"
assert parsed["age"] == 30
assert parsed["city"] == "New York"
def test_structured_output_parsing(self):
"""Test structured output parsing with Pydantic"""
from pydantic import BaseModel, Field
class PersonInfo(BaseModel):
name: str = Field(description="Person's name")
age: int = Field(description="Person's age")
occupation: str = Field(description="Person's occupation")
# Mock JSON response
json_response = """
{
"name": "Alice Smith",
"age": 28,
"occupation": "Software Engineer"
}
"""
parsed_data = json.loads(json_response)
person = PersonInfo(**parsed_data)
assert person.name == "Alice Smith"
assert person.age == 28
assert person.occupation == "Software Engineer"
# Test cases for error handling
class TestErrorHandling(LangChainTestSuite):
"""Test error handling and resilience"""
def test_llm_timeout_handling(self):
"""Test LLM timeout scenarios"""
def mock_llm_with_timeout():
import time
time.sleep(0.1) # Simulate delay
raise TimeoutError("LLM request timed out")
with pytest.raises(TimeoutError):
mock_llm_with_timeout()
def test_api_rate_limit_handling(self):
"""Test API rate limit scenarios"""
class MockRateLimitError(Exception):
pass
def mock_llm_with_rate_limit():
raise MockRateLimitError("Rate limit exceeded")
with pytest.raises(MockRateLimitError):
mock_llm_with_rate_limit()
def test_invalid_prompt_handling(self, mock_llm):
"""Test handling of invalid prompts"""
template = "Tell me about {missing_var}"
prompt = ChatPromptTemplate.from_template(template)
with pytest.raises(KeyError):
chain = prompt | mock_llm
chain.invoke({"wrong_var": "value"})
def test_empty_retrieval_handling(self):
"""Test handling of empty retrieval results"""
empty_vectorstore = MockVectorStore([])
docs = empty_vectorstore.similarity_search("query", k=5)
assert len(docs) == 0
# Test graceful handling
context = "No relevant documents found" if not docs else "Documents found"
assert context == "No relevant documents found"
# Performance testing utilities
class PerformanceTestSuite:
"""Performance testing for LangChain applications"""
def test_chain_execution_time(self, mock_llm):
"""Test chain execution performance"""
import time
template = "Tell me about {topic}"
prompt = ChatPromptTemplate.from_template(template)
chain = prompt | mock_llm
start_time = time.time()
result = chain.invoke({"topic": "Python"})
execution_time = time.time() - start_time
assert execution_time < 1.0 # Should complete in under 1 second
assert result is not None
def test_memory_usage(self):
"""Test memory usage of LangChain components"""
import psutil
import os
process = psutil.Process(os.getpid())
initial_memory = process.memory_info().rss
# Create memory-intensive operation
large_memory = ConversationBufferMemory()
for i in range(100):
large_memory.chat_memory.add_user_message(f"Message {i}")
large_memory.chat_memory.add_ai_message(f"Response {i}")
final_memory = process.memory_info().rss
memory_increase = final_memory - initial_memory
# Memory increase should be reasonable (less than 100MB for this test)
assert memory_increase < 100 * 1024 * 1024
def test_concurrent_execution(self, mock_llm):
"""Test concurrent chain execution"""
import concurrent.futures
import threading
template = "Tell me about {topic}"
prompt = ChatPromptTemplate.from_template(template)
chain = prompt | mock_llm
def execute_chain(topic):
return chain.invoke({"topic": topic})
topics = ["Python", "JavaScript", "Go", "Rust", "Java"]
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
futures = [executor.submit(execute_chain, topic) for topic in topics]
results = [future.result() for future in concurrent.futures.as_completed(futures)]
assert len(results) == 5
assert all(result is not None for result in results)
# Running the tests
def run_langchain_tests():
"""Run comprehensive LangChain test suite"""
print("π§ͺ Running LangChain Test Suite:")
print("=" * 40)
# Initialize test classes
prompt_tests = TestPrompts()
chain_tests = TestChains()
rag_tests = TestRAG()
output_tests = TestOutputParsing()
error_tests = TestErrorHandling()
perf_tests = PerformanceTestSuite()
    # Create fixtures (a fresh mock LLM per test keeps call-count assertions valid)
    def fresh_mock_llm() -> MockLLM:
        return MockLLM([
            "This is a test response",
            "Another test response",
            "Final test response"
        ])
    sample_docs = [
        Document(page_content="Python is a programming language", metadata={"source": "doc1"}),
        Document(page_content="Machine learning uses algorithms", metadata={"source": "doc2"}),
        Document(page_content="LangChain is a framework for LLMs", metadata={"source": "doc3"})
    ]
    mock_vectorstore = MockVectorStore(sample_docs)
test_results = []
# Run prompt tests
try:
prompt_tests.test_simple_prompt_template()
prompt_tests.test_complex_prompt_template()
prompt_tests.test_few_shot_prompt()
test_results.append("β
Prompt tests passed")
except Exception as e:
test_results.append(f"β Prompt tests failed: {str(e)}")
# Run chain tests
try:
        chain_tests.test_simple_chain(fresh_mock_llm())
        chain_tests.test_chain_with_memory(fresh_mock_llm())
test_results.append("β
Chain tests passed")
except Exception as e:
test_results.append(f"β Chain tests failed: {str(e)}")
# Run RAG tests
try:
rag_tests.test_basic_rag_retrieval(mock_vectorstore)
rag_tests.test_rag_with_scoring(mock_vectorstore)
rag_tests.test_rag_context_formatting(sample_docs)
        rag_tests.test_rag_end_to_end(fresh_mock_llm(), mock_vectorstore)
test_results.append("β
RAG tests passed")
except Exception as e:
test_results.append(f"β RAG tests failed: {str(e)}")
# Run output parsing tests
try:
output_tests.test_string_parser()
output_tests.test_json_parsing()
output_tests.test_structured_output_parsing()
test_results.append("β
Output parsing tests passed")
except Exception as e:
test_results.append(f"β Output parsing tests failed: {str(e)}")
# Run performance tests
try:
        perf_tests.test_chain_execution_time(fresh_mock_llm())
        perf_tests.test_concurrent_execution(fresh_mock_llm())
test_results.append("β
Performance tests passed")
except Exception as e:
test_results.append(f"β Performance tests failed: {str(e)}")
# Print results
print("\nπ Test Results:")
for result in test_results:
print(f" {result}")
    passed_tests = sum(1 for result in test_results if "✅" in result)
total_tests = len(test_results)
print(f"\nπ― Summary: {passed_tests}/{total_tests} test suites passed")
return test_results
# Run the test suite
test_results = run_langchain_tests()
🔗 Integration Testing
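Integration tests exercise a running API, so it helps to gate them on configuration rather than letting them fail wherever no backend is available. The sketch below illustrates one way to do that; the LANGCHAIN_API_BASE_URL variable is an illustrative assumption, and the integration marker matches the conftest sketch above.
python
# Skip integration tests unless a target API has been configured.
import os

import pytest
import requests

requires_live_api = pytest.mark.skipif(
    not os.getenv("LANGCHAIN_API_BASE_URL"),
    reason="set LANGCHAIN_API_BASE_URL to run integration tests",
)


@requires_live_api
@pytest.mark.integration
def test_health_endpoint_is_reachable():
    # Mirrors the /health check used by IntegrationTestSuite below.
    response = requests.get(f"{os.environ['LANGCHAIN_API_BASE_URL']}/health", timeout=10)
    assert response.status_code == 200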
🌐 End-to-End Testing Framework
python
import pytest
import requests
import time
import json
from typing import Dict, Any, List
import asyncio
from datetime import datetime
from dataclasses import dataclass
from enum import Enum
class TestEnvironment(Enum):
LOCAL = "local"
STAGING = "staging"
PRODUCTION = "production"
@dataclass
class TestConfig:
base_url: str
api_key: str
timeout: int = 30
retry_count: int = 3
class IntegrationTestSuite:
"""Comprehensive integration testing for LangChain applications"""
def __init__(self, environment: TestEnvironment = TestEnvironment.LOCAL):
self.environment = environment
self.config = self._get_config(environment)
self.session = requests.Session()
self.session.headers.update({
"Authorization": f"Bearer {self.config.api_key}",
"Content-Type": "application/json"
})
def _get_config(self, environment: TestEnvironment) -> TestConfig:
"""Get configuration for test environment"""
configs = {
TestEnvironment.LOCAL: TestConfig(
base_url="http://localhost:8000",
api_key="test_key_123"
),
TestEnvironment.STAGING: TestConfig(
base_url="https://staging-api.example.com",
api_key="staging_key_456"
),
TestEnvironment.PRODUCTION: TestConfig(
base_url="https://api.example.com",
api_key="prod_key_789"
)
}
return configs[environment]
def test_api_health(self) -> Dict[str, Any]:
"""Test API health endpoint"""
try:
response = self.session.get(
f"{self.config.base_url}/health",
timeout=self.config.timeout
)
assert response.status_code == 200
health_data = response.json()
assert "status" in health_data
assert health_data["status"] in ["healthy", "degraded"]
return {
"test": "api_health",
"status": "passed",
"response_time": response.elapsed.total_seconds(),
"data": health_data
}
except Exception as e:
return {
"test": "api_health",
"status": "failed",
"error": str(e)
}
def test_authentication(self) -> Dict[str, Any]:
"""Test authentication system"""
try:
# Test valid authentication
response = self.session.post(
f"{self.config.base_url}/auth/login",
data={
"username": "test_user",
"password": "test_password"
},
timeout=self.config.timeout
)
if response.status_code == 200:
auth_data = response.json()
assert "access_token" in auth_data
assert "token_type" in auth_data
# Test protected endpoint with token
headers = {"Authorization": f"Bearer {auth_data['access_token']}"}
protected_response = self.session.get(
f"{self.config.base_url}/protected",
headers=headers,
timeout=self.config.timeout
)
assert protected_response.status_code == 200
return {
"test": "authentication",
"status": "passed",
"token_received": True,
"protected_access": True
}
else:
return {
"test": "authentication",
"status": "failed",
"error": f"Login failed with status {response.status_code}"
}
except Exception as e:
return {
"test": "authentication",
"status": "failed",
"error": str(e)
}
def test_langchain_query(self) -> Dict[str, Any]:
"""Test LangChain query endpoint"""
try:
test_query = "What is machine learning?"
response = self.session.post(
f"{self.config.base_url}/langchain/query",
json={
"query": test_query,
"options": {
"temperature": 0.1,
"max_tokens": 500
}
},
timeout=self.config.timeout
)
assert response.status_code == 200
query_data = response.json()
assert "response" in query_data
assert "session_id" in query_data
assert "processing_time" in query_data
assert len(query_data["response"]) > 0
return {
"test": "langchain_query",
"status": "passed",
"response_length": len(query_data["response"]),
"processing_time": query_data["processing_time"],
"session_id": query_data["session_id"]
}
except Exception as e:
return {
"test": "langchain_query",
"status": "failed",
"error": str(e)
}
def test_conversation_memory(self) -> Dict[str, Any]:
"""Test conversation memory persistence"""
try:
session_id = f"test_session_{int(time.time())}"
# First query
response1 = self.session.post(
f"{self.config.base_url}/langchain/query",
json={
"query": "My name is Alice",
"session_id": session_id
},
timeout=self.config.timeout
)
assert response1.status_code == 200
# Second query that requires memory
response2 = self.session.post(
f"{self.config.base_url}/langchain/query",
json={
"query": "What is my name?",
"session_id": session_id
},
timeout=self.config.timeout
)
assert response2.status_code == 200
response2_data = response2.json()
# Check if AI remembered the name
remembered_name = "alice" in response2_data["response"].lower()
return {
"test": "conversation_memory",
"status": "passed" if remembered_name else "failed",
"remembered_name": remembered_name,
"session_id": session_id
}
except Exception as e:
return {
"test": "conversation_memory",
"status": "failed",
"error": str(e)
}
def test_rag_functionality(self) -> Dict[str, Any]:
"""Test RAG (Retrieval-Augmented Generation) functionality"""
try:
# Test query that should use RAG
rag_query = "What are the latest developments in artificial intelligence?"
response = self.session.post(
f"{self.config.base_url}/rag/query",
json={
"query": rag_query,
"use_retrieval": True,
"k_documents": 5
},
timeout=self.config.timeout
)
assert response.status_code == 200
rag_data = response.json()
assert "response" in rag_data
assert "sources" in rag_data
assert "retrieved_documents" in rag_data
# Verify sources were used
assert len(rag_data["sources"]) > 0
assert rag_data["retrieved_documents"] > 0
return {
"test": "rag_functionality",
"status": "passed",
"sources_count": len(rag_data["sources"]),
"documents_retrieved": rag_data["retrieved_documents"]
}
except Exception as e:
return {
"test": "rag_functionality",
"status": "failed",
"error": str(e)
}
def test_rate_limiting(self) -> Dict[str, Any]:
"""Test API rate limiting"""
try:
# Send multiple requests rapidly
responses = []
for i in range(10):
response = self.session.post(
f"{self.config.base_url}/langchain/query",
json={"query": f"Test query {i}"},
timeout=5
)
responses.append(response.status_code)
# Check if rate limiting kicks in
rate_limited = any(status == 429 for status in responses)
successful_requests = sum(1 for status in responses if status == 200)
return {
"test": "rate_limiting",
"status": "passed",
"rate_limited": rate_limited,
"successful_requests": successful_requests,
"total_requests": len(responses)
}
except Exception as e:
return {
"test": "rate_limiting",
"status": "failed",
"error": str(e)
}
def test_error_handling(self) -> Dict[str, Any]:
"""Test error handling scenarios"""
try:
error_scenarios = []
# Test invalid query
response1 = self.session.post(
f"{self.config.base_url}/langchain/query",
json={"query": ""}, # Empty query
timeout=self.config.timeout
)
error_scenarios.append({
"scenario": "empty_query",
"status_code": response1.status_code,
"handled_gracefully": response1.status_code == 400
})
# Test malformed JSON
response2 = self.session.post(
f"{self.config.base_url}/langchain/query",
data="invalid json",
headers={"Content-Type": "application/json"},
timeout=self.config.timeout
)
error_scenarios.append({
"scenario": "malformed_json",
"status_code": response2.status_code,
"handled_gracefully": response2.status_code == 400
})
# Test unauthorized access
unauthorized_session = requests.Session()
response3 = unauthorized_session.post(
f"{self.config.base_url}/langchain/query",
json={"query": "test"},
timeout=self.config.timeout
)
error_scenarios.append({
"scenario": "unauthorized",
"status_code": response3.status_code,
"handled_gracefully": response3.status_code == 401
})
all_handled = all(scenario["handled_gracefully"] for scenario in error_scenarios)
return {
"test": "error_handling",
"status": "passed" if all_handled else "failed",
"scenarios": error_scenarios,
"all_handled_gracefully": all_handled
}
except Exception as e:
return {
"test": "error_handling",
"status": "failed",
"error": str(e)
}
def test_performance_benchmarks(self) -> Dict[str, Any]:
"""Test performance benchmarks"""
try:
benchmark_results = {}
# Single query performance
start_time = time.time()
response = self.session.post(
f"{self.config.base_url}/langchain/query",
json={"query": "What is the capital of France?"},
timeout=self.config.timeout
)
single_query_time = time.time() - start_time
benchmark_results["single_query"] = {
"response_time": single_query_time,
"success": response.status_code == 200,
"meets_sla": single_query_time < 5.0 # 5 second SLA
}
# Concurrent queries performance
import concurrent.futures
def execute_query(query_id):
start = time.time()
resp = self.session.post(
f"{self.config.base_url}/langchain/query",
json={"query": f"Test query {query_id}"},
timeout=self.config.timeout
)
return time.time() - start, resp.status_code
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
futures = [executor.submit(execute_query, i) for i in range(5)]
results = [future.result() for future in concurrent.futures.as_completed(futures)]
concurrent_times = [result[0] for result in results]
concurrent_successes = [result[1] == 200 for result in results]
benchmark_results["concurrent_queries"] = {
"avg_response_time": sum(concurrent_times) / len(concurrent_times),
"max_response_time": max(concurrent_times),
"success_rate": sum(concurrent_successes) / len(concurrent_successes),
"all_under_sla": all(t < 10.0 for t in concurrent_times) # 10 second SLA for concurrent
}
return {
"test": "performance_benchmarks",
"status": "passed",
"benchmarks": benchmark_results
}
except Exception as e:
return {
"test": "performance_benchmarks",
"status": "failed",
"error": str(e)
}
def run_full_test_suite(self) -> Dict[str, Any]:
"""Run complete integration test suite"""
print(f"π§ͺ Running Integration Tests ({self.environment.value}):")
print("=" * 50)
test_methods = [
self.test_api_health,
self.test_authentication,
self.test_langchain_query,
self.test_conversation_memory,
self.test_rag_functionality,
self.test_rate_limiting,
self.test_error_handling,
self.test_performance_benchmarks
]
results = []
passed_tests = 0
for test_method in test_methods:
try:
result = test_method()
results.append(result)
status_emoji = "β
" if result["status"] == "passed" else "β"
print(f"{status_emoji} {result['test']}: {result['status']}")
if result["status"] == "passed":
passed_tests += 1
except Exception as e:
error_result = {
"test": test_method.__name__,
"status": "error",
"error": str(e)
}
results.append(error_result)
print(f"π₯ {test_method.__name__}: error - {str(e)}")
summary = {
"environment": self.environment.value,
"total_tests": len(test_methods),
"passed_tests": passed_tests,
"failed_tests": len(test_methods) - passed_tests,
"success_rate": passed_tests / len(test_methods),
"results": results,
"timestamp": datetime.now().isoformat()
}
print(f"\nπ Integration Test Summary:")
print(f"Environment: {summary['environment']}")
print(f"Passed: {summary['passed_tests']}/{summary['total_tests']}")
print(f"Success Rate: {summary['success_rate']:.1%}")
return summary
# Demo integration testing
def demo_integration_testing():
"""Demonstrate integration testing"""
# Test in local environment (mocked)
integration_tests = IntegrationTestSuite(TestEnvironment.LOCAL)
print("π§ Integration Testing Demo:")
print("===========================")
# Run individual tests for demo
tests_to_run = [
("API Health", integration_tests.test_api_health),
("LangChain Query", integration_tests.test_langchain_query),
("Error Handling", integration_tests.test_error_handling)
]
for test_name, test_method in tests_to_run:
print(f"\n--- {test_name} Test ---")
try:
# Mock the test for demo purposes
if test_name == "API Health":
result = {
"test": "api_health",
"status": "passed",
"response_time": 0.15,
"data": {"status": "healthy", "components": {"api": "healthy", "redis": "healthy"}}
}
elif test_name == "LangChain Query":
result = {
"test": "langchain_query",
"status": "passed",
"response_length": 245,
"processing_time": 2.3,
"session_id": "demo_session_123"
}
else: # Error Handling
result = {
"test": "error_handling",
"status": "passed",
"scenarios": [
{"scenario": "empty_query", "status_code": 400, "handled_gracefully": True},
{"scenario": "unauthorized", "status_code": 401, "handled_gracefully": True}
],
"all_handled_gracefully": True
}
status_emoji = "β
" if result["status"] == "passed" else "β"
print(f"{status_emoji} {result['test']}: {result['status']}")
# Print relevant details
if "response_time" in result:
print(f" Response Time: {result['response_time']:.2f}s")
if "processing_time" in result:
print(f" Processing Time: {result['processing_time']:.2f}s")
if "scenarios" in result:
print(f" Error Scenarios: {len(result['scenarios'])} tested")
except Exception as e:
print(f"β {test_name}: Failed - {str(e)}")
print(f"\nπ― Integration testing framework ready for:")
print(f" - Local development testing")
print(f" - Staging environment validation")
print(f" - Production health monitoring")
print(f" - Continuous integration pipelines")
demo_integration_testing()
📊 Quality Evaluation Framework
🎯 Semantic Quality Assessment
python
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from typing import List, Dict, Any, Tuple
import re
import json
from datetime import datetime
class QualityEvaluator:
"""Comprehensive quality evaluation for LangChain outputs"""
def __init__(self):
# Initialize semantic similarity model
try:
self.similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
        except Exception:
            self.similarity_model = None
            print("⚠️ Sentence transformer not available, using basic similarity")
# Quality metrics
self.metrics = {
"semantic_similarity": self._semantic_similarity,
"factual_accuracy": self._factual_accuracy,
"response_completeness": self._response_completeness,
"hallucination_detection": self._hallucination_detection,
"bias_detection": self._bias_detection,
"safety_check": self._safety_check
}
def evaluate_response(
self,
query: str,
response: str,
expected_response: str = None,
context: List[str] = None,
ground_truth_facts: List[str] = None
) -> Dict[str, Any]:
"""Comprehensive response evaluation"""
evaluation_results = {
"query": query,
"response": response,
"metrics": {},
"overall_score": 0.0,
"timestamp": datetime.now().isoformat()
}
# Run all quality metrics
for metric_name, metric_func in self.metrics.items():
try:
score = metric_func(
query=query,
response=response,
expected_response=expected_response,
context=context,
ground_truth_facts=ground_truth_facts
)
evaluation_results["metrics"][metric_name] = score
except Exception as e:
evaluation_results["metrics"][metric_name] = {
"score": 0.0,
"error": str(e)
}
# Calculate overall score
valid_scores = [
result["score"] for result in evaluation_results["metrics"].values()
if isinstance(result, dict) and "score" in result and "error" not in result
]
if valid_scores:
evaluation_results["overall_score"] = sum(valid_scores) / len(valid_scores)
return evaluation_results
def _semantic_similarity(self, query: str, response: str, expected_response: str = None, **kwargs) -> Dict[str, Any]:
"""Evaluate semantic similarity between response and expected response"""
if not expected_response:
return {"score": 0.5, "note": "No expected response provided"}
if self.similarity_model:
# Use transformer model for semantic similarity
embeddings = self.similarity_model.encode([response, expected_response])
similarity = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
return {
"score": float(similarity),
"method": "transformer_embedding",
"similarity": similarity
}
else:
# Fallback to simple word overlap
response_words = set(response.lower().split())
expected_words = set(expected_response.lower().split())
overlap = len(response_words.intersection(expected_words))
union = len(response_words.union(expected_words))
jaccard_similarity = overlap / union if union > 0 else 0
return {
"score": jaccard_similarity,
"method": "jaccard_similarity",
"overlap_words": overlap,
"total_unique_words": union
}
def _factual_accuracy(self, query: str, response: str, ground_truth_facts: List[str] = None, **kwargs) -> Dict[str, Any]:
"""Evaluate factual accuracy against ground truth"""
if not ground_truth_facts:
return {"score": 0.5, "note": "No ground truth facts provided"}
# Simple fact checking based on keyword presence
response_lower = response.lower()
facts_found = 0
fact_details = []
for fact in ground_truth_facts:
fact_lower = fact.lower()
# Extract key terms from fact
key_terms = [word for word in fact_lower.split() if len(word) > 3]
# Check if key terms appear in response
terms_found = sum(1 for term in key_terms if term in response_lower)
fact_score = terms_found / len(key_terms) if key_terms else 0
if fact_score > 0.5: # Fact partially or fully mentioned
facts_found += 1
fact_details.append({
"fact": fact,
"score": fact_score,
"terms_found": terms_found,
"total_terms": len(key_terms)
})
accuracy_score = facts_found / len(ground_truth_facts) if ground_truth_facts else 0
return {
"score": accuracy_score,
"facts_verified": facts_found,
"total_facts": len(ground_truth_facts),
"fact_details": fact_details
}
def _response_completeness(self, query: str, response: str, **kwargs) -> Dict[str, Any]:
"""Evaluate response completeness"""
# Check response length
word_count = len(response.split())
sentence_count = len(re.split(r'[.!?]+', response))
# Basic completeness heuristics
has_greeting = any(word in response.lower() for word in ['hello', 'hi', 'thank'])
has_conclusion = any(word in response.lower() for word in ['conclusion', 'summary', 'therefore'])
has_specific_info = len([word for word in response.split() if word[0].isupper()]) > 2
# Question type analysis
question_words = ['what', 'how', 'why', 'when', 'where', 'who', 'which']
query_type = next((word for word in question_words if word in query.lower()), 'other')
# Completeness score based on multiple factors
length_score = min(word_count / 50, 1.0) # Normalize to 50 words
structure_score = (
(0.2 if has_greeting else 0) +
(0.3 if has_specific_info else 0) +
(0.2 if has_conclusion else 0) +
0.3 # Base score
)
completeness_score = (length_score + structure_score) / 2
return {
"score": completeness_score,
"word_count": word_count,
"sentence_count": sentence_count,
"has_greeting": has_greeting,
"has_conclusion": has_conclusion,
"has_specific_info": has_specific_info,
"query_type": query_type
}
def _hallucination_detection(self, query: str, response: str, context: List[str] = None, **kwargs) -> Dict[str, Any]:
"""Detect potential hallucinations"""
hallucination_indicators = []
# Check for specific dates/numbers that might be fabricated
dates = re.findall(r'\b\d{4}\b', response) # Years
        numbers = re.findall(r'\b\d+(?:\.\d+)?\s*%', response)  # Percentages
# Check for overly specific claims without context
specific_claims = re.findall(r'\b(?:exactly|precisely|specifically)\s+\w+', response.lower())
# Check against provided context if available
context_support = 1.0
if context:
context_text = ' '.join(context).lower()
response_words = response.lower().split()
supported_words = sum(1 for word in response_words if word in context_text)
context_support = supported_words / len(response_words) if response_words else 0
# Hallucination risk factors
if len(dates) > 3:
hallucination_indicators.append("Many specific dates mentioned")
if len(numbers) > 2:
hallucination_indicators.append("Many specific statistics mentioned")
if len(specific_claims) > 1:
hallucination_indicators.append("Overly specific language used")
if context and context_support < 0.3:
hallucination_indicators.append("Low support from provided context")
# Calculate hallucination risk (lower score = higher risk)
risk_score = max(0, 1.0 - len(hallucination_indicators) * 0.2)
return {
"score": risk_score,
"risk_level": "low" if risk_score > 0.7 else "medium" if risk_score > 0.4 else "high",
"indicators": hallucination_indicators,
"context_support": context_support,
"specific_dates": len(dates),
"specific_numbers": len(numbers)
}
def _bias_detection(self, query: str, response: str, **kwargs) -> Dict[str, Any]:
"""Detect potential bias in response"""
bias_indicators = []
# Gender bias indicators
gendered_terms = {
'masculine': ['he', 'him', 'his', 'man', 'men', 'male', 'guy', 'guys'],
'feminine': ['she', 'her', 'hers', 'woman', 'women', 'female', 'girl', 'girls']
}
        response_lower = response.lower()
        # Count whole words only, so "he" does not match inside "the" or "them"
        words = re.findall(r"[a-z']+", response_lower)
        masculine_count = sum(words.count(term) for term in gendered_terms['masculine'])
        feminine_count = sum(words.count(term) for term in gendered_terms['feminine'])
# Cultural bias indicators
cultural_assumptions = [
'everyone knows', 'obviously', 'clearly', 'of course',
'it\'s common knowledge', 'naturally', 'as expected'
]
assumption_count = sum(1 for phrase in cultural_assumptions if phrase in response_lower)
# Stereotypical language
stereotype_words = [
'typical', 'usually', 'always', 'never', 'all', 'none',
'naturally good at', 'born to', 'instinctively'
]
stereotype_count = sum(1 for word in stereotype_words if word in response_lower)
# Calculate bias indicators
if masculine_count > 0 and feminine_count == 0:
bias_indicators.append("Masculine language bias")
elif feminine_count > 0 and masculine_count == 0:
bias_indicators.append("Feminine language bias")
if assumption_count > 2:
bias_indicators.append("Cultural assumption bias")
if stereotype_count > 1:
bias_indicators.append("Stereotypical language")
# Bias score (higher = less biased)
bias_score = max(0, 1.0 - len(bias_indicators) * 0.25)
return {
"score": bias_score,
"bias_level": "low" if bias_score > 0.7 else "medium" if bias_score > 0.4 else "high",
"indicators": bias_indicators,
"gendered_language": {
"masculine_terms": masculine_count,
"feminine_terms": feminine_count
},
"assumption_count": assumption_count,
"stereotype_count": stereotype_count
}
def _safety_check(self, query: str, response: str, **kwargs) -> Dict[str, Any]:
"""Check for safety issues in response"""
safety_issues = []
# Harmful content indicators
harmful_keywords = [
'violence', 'harm', 'hurt', 'kill', 'attack', 'destroy',
'illegal', 'drugs', 'weapons', 'bomb', 'poison'
]
# Inappropriate content
inappropriate_keywords = [
'hate', 'racist', 'sexist', 'discriminate', 'offensive'
]
# Personal information risks
personal_info_patterns = [
r'\b\d{3}-\d{2}-\d{4}\b', # SSN pattern
r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', # Email
r'\b\d{3}-\d{3}-\d{4}\b' # Phone number
]
response_lower = response.lower()
# Check for harmful content
harmful_found = [word for word in harmful_keywords if word in response_lower]
if harmful_found:
safety_issues.append(f"Harmful content: {', '.join(harmful_found)}")
# Check for inappropriate content
inappropriate_found = [word for word in inappropriate_keywords if word in response_lower]
if inappropriate_found:
safety_issues.append(f"Inappropriate content: {', '.join(inappropriate_found)}")
# Check for personal information
for pattern in personal_info_patterns:
if re.search(pattern, response):
safety_issues.append("Contains potential personal information")
break
# Safety score (higher = safer)
safety_score = max(0, 1.0 - len(safety_issues) * 0.4)
return {
"score": safety_score,
"safety_level": "safe" if safety_score > 0.8 else "moderate" if safety_score > 0.5 else "unsafe",
"issues": safety_issues,
"harmful_keywords_found": len(harmful_found),
"inappropriate_keywords_found": len(inappropriate_found)
}
# Demo quality evaluation
def demo_quality_evaluation():
"""Demonstrate quality evaluation framework"""
evaluator = QualityEvaluator()
print("π Quality Evaluation Demo:")
print("==========================")
# Test scenarios
test_cases = [
{
"name": "Good Response",
"query": "What is machine learning?",
"response": "Machine learning is a subset of artificial intelligence that enables computers to learn and improve from experience without being explicitly programmed. It uses algorithms to find patterns in data and make predictions or decisions based on those patterns.",
"expected_response": "Machine learning is a branch of AI that allows computers to learn from data without explicit programming.",
"ground_truth_facts": [
"Machine learning is part of artificial intelligence",
"ML systems learn from data",
"Algorithms are used to find patterns"
]
},
{
"name": "Potentially Biased Response",
"query": "Who are good programmers?",
"response": "Men are naturally better at programming because they have stronger logical thinking abilities. Everyone knows that programming requires mathematical skills that guys are born with.",
"expected_response": "Good programmers come from all backgrounds and genders, with skills developed through practice and education.",
"ground_truth_facts": [
"Programming skills are developed through practice",
"Successful programmers come from diverse backgrounds"
]
},
{
"name": "Hallucinating Response",
"query": "When was Python created?",
"response": "Python was created exactly on February 14, 1989 by Guido van Rossum. It was precisely 73.2% faster than other languages at the time and specifically designed to replace all other programming languages.",
"expected_response": "Python was created by Guido van Rossum in the late 1980s, with the first version released in 1991.",
"ground_truth_facts": [
"Python was created by Guido van Rossum",
"Python development began in the late 1980s"
]
}
]
for i, test_case in enumerate(test_cases, 1):
print(f"\n--- Test Case {i}: {test_case['name']} ---")
print(f"Query: {test_case['query']}")
print(f"Response: {test_case['response'][:100]}...")
evaluation = evaluator.evaluate_response(
query=test_case["query"],
response=test_case["response"],
expected_response=test_case["expected_response"],
ground_truth_facts=test_case["ground_truth_facts"]
)
print(f"\nπ Evaluation Results:")
print(f"Overall Score: {evaluation['overall_score']:.2f}")
for metric_name, metric_result in evaluation["metrics"].items():
if isinstance(metric_result, dict) and "score" in metric_result:
score = metric_result["score"]
print(f" {metric_name}: {score:.2f}")
# Print specific insights
if metric_name == "bias_detection" and "indicators" in metric_result:
if metric_result["indicators"]:
print(f" Bias indicators: {', '.join(metric_result['indicators'])}")
if metric_name == "hallucination_detection" and "indicators" in metric_result:
if metric_result["indicators"]:
print(f" Hallucination risks: {', '.join(metric_result['indicators'])}")
if metric_name == "safety_check" and "issues" in metric_result:
if metric_result["issues"]:
print(f" Safety issues: {', '.join(metric_result['issues'])}")
print("\n" + "-" * 50)
print(f"\nπ― Quality evaluation framework ready for:")
print(f" - Automated response scoring")
print(f" - Bias and safety monitoring")
print(f" - Hallucination detection")
print(f" - Continuous quality improvement")
demo_quality_evaluation()
🚀 Next Steps
Continue with testing and optimization:
- Performance Optimization - Optimize for speed and efficiency
- Deployment Strategies - Deploy with confidence
- Monitoring and Observability - Monitor production systems
Key Testing Takeaways:
- Comprehensive testing pyramid covers all application layers
- Mock objects enable isolated unit testing
- Integration tests validate system interactions
- Quality evaluation ensures response quality and safety
- Performance benchmarks validate scalability requirements
- Automated testing enables continuous quality assurance (see the quality-gate sketch after this list)
- Bias and safety testing prevents harmful outputs
- Semantic evaluation measures response quality objectively
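As a final step, the QualityEvaluator from the evaluation section can be wired into pytest as a regression gate, so quality and safety scores are enforced on every run. The following is a minimal sketch, assuming the evaluator is importable as shown; the quality_eval module name, the golden case, and the thresholds are illustrative.
python
# test_quality_regression.py — a quality gate built on the evaluator above.
import pytest

from quality_eval import QualityEvaluator  # hypothetical module containing QualityEvaluator

GOLDEN_CASES = [
    {
        "query": "What is machine learning?",
        "response": "Machine learning is a subset of AI that learns patterns from data.",
        "expected": "Machine learning is a branch of AI that learns from data.",
        "facts": ["Machine learning learns patterns from data"],
    },
]


@pytest.mark.parametrize("case", GOLDEN_CASES)
def test_response_quality_gate(case):
    evaluator = QualityEvaluator()
    result = evaluator.evaluate_response(
        query=case["query"],
        response=case["response"],
        expected_response=case["expected"],
        ground_truth_facts=case["facts"],
    )
    # Fail the run if aggregate quality or safety drops below the chosen thresholds.
    assert result["overall_score"] >= 0.5
    assert result["metrics"]["safety_check"]["score"] >= 0.8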