Retrieval-Augmented Generation with LlamaIndex
Source: examples/online_serving/retrieval_augmented_generation_with_llamaindex.py.
# SPDX-License-Identifier: Apache-2.0
"""
RAG (Retrieval Augmented Generation) Implementation with LlamaIndex
===================================================================
This script demonstrates a RAG system using:
- LlamaIndex: For document indexing and retrieval
- Milvus: As vector store backend
- vLLM: For embedding and text generation
Features:
1. Document Loading & Processing
2. Embedding & Storage
3. Query Processing
Requirements:
1. Install dependencies:
pip install llama-index llama-index-readers-web \
llama-index-llms-openai-like \
llama-index-embeddings-openai-like \
llama-index-vector-stores-milvus
2. Start services:
# Start embedding service (port 8000)
vllm serve ssmits/Qwen2-7B-Instruct-embed-base
# Start chat service (port 8001)
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
Usage:
python retrieval_augmented_generation_with_llamaindex.py
Notes:
- Ensure both vLLM services are running before executing
- Default ports: 8000 (embedding), 8001 (chat)
- First run may take time to download models
"""
import argparse
from argparse import Namespace
from typing import Any
from llama_index.core import Settings, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai_like import OpenAILikeEmbedding
from llama_index.llms.openai_like import OpenAILike
from llama_index.readers.web import SimpleWebPageReader
from llama_index.vector_stores.milvus import MilvusVectorStore

def init_config(args: Namespace):
"""Initialize configuration with command line arguments"""
return {
"url": args.url,
"embedding_model": args.embedding_model,
"chat_model": args.chat_model,
"vllm_api_key": args.vllm_api_key,
"embedding_endpoint": args.embedding_endpoint,
"chat_endpoint": args.chat_endpoint,
"db_path": args.db_path,
"chunk_size": args.chunk_size,
"chunk_overlap": args.chunk_overlap,
"top_k": args.top_k,
}

def load_documents(url: str) -> list:
"""Load and process web documents"""
return SimpleWebPageReader(html_to_text=True).load_data([url])
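
# Not part of the original example: the same pipeline also works on local
# files. A minimal sketch (the ./docs path is hypothetical), kept as an
# optional helper that the script never calls:
def load_local_documents(path: str = "./docs") -> list:
    """Load and process documents from a local directory"""
    from llama_index.core import SimpleDirectoryReader

    return SimpleDirectoryReader(path).load_data()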

def setup_models(config: dict[str, Any]):
"""Configure embedding and chat models"""
Settings.embed_model = OpenAILikeEmbedding(
api_base=config["embedding_endpoint"],
api_key=config["vllm_api_key"],
model_name=config["embedding_model"],
)
Settings.llm = OpenAILike(
model=config["chat_model"],
api_key=config["vllm_api_key"],
api_base=config["chat_endpoint"],
context_window=128000,
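        # context_window here is client-side metadata that LlamaIndex uses for
        # prompt budgeting; it does not reconfigure the server. Qwen1.5-0.5B-Chat
        # itself supports a 32K context, so 32768 would be a more realistic value.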
is_chat_model=True,
is_function_calling_model=False,
)
Settings.transformations = [
SentenceSplitter(
chunk_size=config["chunk_size"],
chunk_overlap=config["chunk_overlap"],
)
]
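
# Optional smoke test, not part of the original flow: once Settings is
# configured, both backends can be exercised directly, e.g.:
#   print(len(Settings.embed_model.get_text_embedding("hello")))
#   print(Settings.llm.complete("Reply with one word: ready?"))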

def setup_vector_store(db_path: str) -> MilvusVectorStore:
"""Initialize vector store"""
sample_emb = Settings.embed_model.get_text_embedding("test")
print(f"Embedding dimension: {len(sample_emb)}")
return MilvusVectorStore(uri=db_path, dim=len(sample_emb), overwrite=True)

def create_index(documents: list, vector_store: MilvusVectorStore):
"""Create document index"""
storage_context = StorageContext.from_defaults(vector_store=vector_store)
return VectorStoreIndex.from_documents(
documents,
storage_context=storage_context,
)

def query_document(index: VectorStoreIndex, question: str, top_k: int):
"""Query document with given question"""
query_engine = index.as_query_engine(similarity_top_k=top_k)
return query_engine.query(question)

def get_parser() -> argparse.ArgumentParser:
"""Parse command line arguments"""
parser = argparse.ArgumentParser(description="RAG with vLLM and LlamaIndex")
# Add command line arguments
parser.add_argument(
"--url",
default=("https://docs.vllm.com.cn/en/latest/getting_started/quickstart.html"),
help="URL of the document to process",
)
parser.add_argument(
"--embedding-model",
default="ssmits/Qwen2-7B-Instruct-embed-base",
help="Model name for embeddings",
)
parser.add_argument(
"--chat-model", default="qwen/Qwen1.5-0.5B-Chat", help="Model name for chat"
)
parser.add_argument(
"--vllm-api-key", default="EMPTY", help="API key for vLLM compatible services"
)
parser.add_argument(
"--embedding-endpoint",
default="http://localhost:8000/v1",
help="Base URL for embedding service",
)
parser.add_argument(
"--chat-endpoint",
default="http://localhost:8001/v1",
help="Base URL for chat service",
)
parser.add_argument(
"--db-path", default="./milvus_demo.db", help="Path to Milvus database"
)
parser.add_argument(
"-i", "--interactive", action="store_true", help="Enable interactive Q&A mode"
)
parser.add_argument(
"-c",
"--chunk-size",
type=int,
default=1000,
help="Chunk size for document splitting",
)
parser.add_argument(
"-o",
"--chunk-overlap",
type=int,
default=200,
help="Chunk overlap for document splitting",
)
parser.add_argument(
"-k", "--top-k", type=int, default=3, help="Number of top results to retrieve"
)
return parser
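
# Example invocations using the flags defined above:
#   python retrieval_augmented_generation_with_llamaindex.py
#   python retrieval_augmented_generation_with_llamaindex.py -i
#   python retrieval_augmented_generation_with_llamaindex.py -c 512 -o 64 -k 5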

def main():
# Parse command line arguments
args = get_parser().parse_args()
# Initialize configuration
config = init_config(args)
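    # Optional fail-fast check, not in the original flow: verify both vLLM
    # servers are reachable before indexing. A minimal sketch, assuming the
    # `requests` package is installed:
    #   import requests
    #   for endpoint in (config["embedding_endpoint"], config["chat_endpoint"]):
    #       requests.get(f"{endpoint}/models", timeout=5).raise_for_status()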
# Load documents
documents = load_documents(config["url"])
# Setup models
setup_models(config)
# Setup vector store
vector_store = setup_vector_store(config["db_path"])
# Create index
index = create_index(documents, vector_store)
if args.interactive:
print("\nEntering interactive mode. Type 'quit' to exit.")
while True:
# Get user question
question = input("\nEnter your question: ")
# Check for exit command
if question.lower() in ["quit", "exit", "q"]:
print("Exiting interactive mode...")
break
# Get and print response
print("\n" + "-" * 50)
print("Response:\n")
response = query_document(index, question, config["top_k"])
print(response)
print("-" * 50)
else:
# Single query mode
question = "How to install vLLM?"
response = query_document(index, question, config["top_k"])
print("-" * 50)
print("Response:\n")
print(response)
print("-" * 50)

if __name__ == "__main__":
main()