From c2a4798be995bcd2203cb070e47d5db9ca510001 Mon Sep 17 00:00:00 2001 From: Minh Hoang Anh TRAN Date: Tue, 24 Feb 2026 14:21:12 +0100 Subject: [PATCH 1/2] ajout du multilingue --- .env.example | 214 ++++++++++++++++++++++++++++++++++++++++++----- README.md | 139 ++++++++++++++++++++++++++++++ rag/ingest.py | 26 ++++-- rag/query.py | 51 ++++++++--- requirements.txt | 1 + 5 files changed, 393 insertions(+), 38 deletions(-) diff --git a/.env.example b/.env.example index 82a7165..88339bf 100644 --- a/.env.example +++ b/.env.example @@ -1,20 +1,50 @@ # ============================================ # MODEL PROVIDER CONFIGURATION # ============================================ -# Choose your LLM provider: ollama, mistral, or openai MODEL_PROVIDER=ollama # -------------------------------------------- # OLLAMA SETTINGS (Local models: Qwen, Mistral, Llama, etc.) # -------------------------------------------- -# Current: Excellent balance of speed and quality (14B params, requires 16GB RAM) +# CURRENT: Mid-end configuration (16GB RAM, mid-end GPU/CPU) OLLAMA_MODEL=qwen2.5:14b-instruct OLLAMA_BASE_URL=http://localhost:11434 -# Alternative LLM models (run 'ollama pull ' first): -# OLLAMA_MODEL=qwen2.5:7b-instruct # Faster: 7B params, requires 8GB RAM, 2x faster -# OLLAMA_MODEL=qwen2.5:32b-instruct # Better: 32B params, requires 32GB RAM, higher quality -# OLLAMA_MODEL=qwen2.5:72b-instruct # Best: 72B params, requires 64GB RAM, maximum quality +# ============================================ +# HARDWARE-BASED MODEL RECOMMENDATIONS +# ============================================ +# Choose the configuration that matches your hardware: + +# --- [1] LOW-END CONFIG (8GB RAM, low-end GPU, low-end CPU) --- +# Best for: Budget laptops, older computers, testing +# OLLAMA_MODEL=qwen2.5:3b-instruct +# Alternative: qwen2.5:7b-instruct (if you can spare the RAM) +# Expected speed: 5-10 seconds/query (CPU), 1-3 seconds (GPU) +# RAM usage: ~4-6GB + +# --- [2] MID-END CONFIG (16GB RAM, mid-end 
GPU, mid-end CPU) --- ✅ CURRENT +# Best for: Modern laptops, standard workstations +# OLLAMA_MODEL=qwen2.5:14b-instruct +# Alternative: qwen2.5:7b-instruct (faster) +# Expected speed: 8-15 seconds/query (CPU), 2-4 seconds (GPU) +# RAM usage: ~10-14GB + +# --- [3] BETTER MID-END CONFIG (16GB RAM, modern GPU, modern CPU) --- +# Best for: Gaming PCs, modern workstations with RTX/RX GPUs +# OLLAMA_MODEL=qwen2.5:14b-instruct +# Alternative: qwen2.5:32b-instruct (if GPU has 12GB+ VRAM) +# Expected speed: 1-2 seconds/query (good GPU), 6-10 seconds (CPU) +# RAM usage: ~12-16GB +# Note: Modern GPU makes huge difference, can handle 32b with quantization + +# --- [4] HIGH-END CONFIG (32GB+ RAM, high-end GPU, high-end CPU) --- +# Best for: Workstations, servers, RTX 4090/A6000, ThreadRipper/Xeon +# OLLAMA_MODEL=qwen2.5:32b-instruct +# Alternatives: +# qwen2.5:72b-instruct (64GB+ RAM) +# qwen2.5:110b-instruct (80-128GB RAM, ultimate quality) +# Expected speed: 0.5-1.5 seconds/query (high-end GPU), 15-30 seconds (CPU) +# RAM usage: ~24-40GB (32b), ~50-80GB (72b), ~90-120GB (110b) # ============================================ # DOCUMENT PROCESSING @@ -55,17 +85,103 @@ CHUNK_OVERLAP=100 # 12.5% overlap prevents context loss at boundaries # ============================================ # EMBEDDING MODEL CONFIGURATION # ============================================ -# FastEmbed provider (local, fast, no Ollama dependency) -EMBEDDING_PROVIDER=fastembed -# --- EMBEDDING MODEL OPTIONS --- -# Current: FAST model for testing/development (384-dim, 5x faster) -EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2 +# --- EMBEDDING PROVIDER SELECTION --- +# Choose: "fastembed" (faster, limited models) or "huggingface" (flexible, any model) +EMBEDDING_PROVIDER=huggingface + +# CURRENT: Mid-end configuration (multilingual) +EMBEDDING_MODEL=intfloat/multilingual-e5-large-instruct + +# ============================================ +# HOW TO SWITCH EMBEDDING PROVIDERS +# 
============================================ +# +# OPTION 1: FastEmbed (faster, optimized, limited model support) +# EMBEDDING_PROVIDER=fastembed +# EMBEDDING_MODEL=BAAI/bge-m3 +# +# OPTION 2: HuggingFace (flexible, supports any model, slightly slower) +# EMBEDDING_PROVIDER=huggingface +# EMBEDDING_MODEL=intfloat/multilingual-e5-large-instruct +# +# ⚠️ After changing provider or model: run `python rag/ingest.py` + +# ============================================ +# HARDWARE-BASED EMBEDDING MODEL RECOMMENDATIONS +# ============================================ +# Choose the configuration that matches your hardware: + +# --- [1] LOW-END CONFIG (8GB RAM, low-end GPU, low-end CPU) --- +# Best for: Budget systems, fast ingestion, English-only or primary language focus +# EMBEDDING_PROVIDER=fastembed +# EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2 +# Specs: 384-dim, 22M params, 5x faster than bge-m3 +# Quality: Good for English, poor for other languages +# Ingestion speed: ~500-800 docs/minute +# RAM usage: ~500MB +# Multilingual: ❌ English-only optimized +# Provider: ✅ FastEmbed supported +# +# Multilingual alternative for low-end: +# EMBEDDING_PROVIDER=fastembed # or huggingface +# EMBEDDING_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 +# Specs: 384-dim, slower than all-MiniLM but supports 50+ languages +# Quality: Good for most languages +# Ingestion speed: ~300-500 docs/minute +# Provider: ✅ Both FastEmbed and HuggingFace + +# --- [2] MID-END CONFIG (16GB RAM, mid-end GPU, mid-end CPU) --- +# Best for: Multilingual document collections, good balance +# EMBEDDING_PROVIDER=fastembed +# EMBEDDING_MODEL=BAAI/bge-m3 +# Specs: 1024-dim, 568M params, excellent multilingual (100+ languages) +# Quality: Excellent for all languages +# Ingestion speed: ~150-250 docs/minute +# RAM usage: ~2GB +# Multilingual: ✅ Excellent (100+ languages) +# Provider: ✅ FastEmbed supported +# +# English-only alternative for mid-end (faster): +# 
EMBEDDING_PROVIDER=fastembed +# EMBEDDING_MODEL=BAAI/bge-base-en-v1.5 +# Specs: 768-dim, 2x faster than bge-m3, English-only +# Quality: Excellent for English +# Ingestion speed: ~250-400 docs/minute +# Provider: ✅ FastEmbed supported + +# --- [3] BETTER MID-END CONFIG (16GB RAM, modern GPU, modern CPU) --- ✅ CURRENT +# Best for: High-quality multilingual RAG, modern systems +# Option A: Use FastEmbed (faster) +# EMBEDDING_PROVIDER=fastembed +# EMBEDDING_MODEL=BAAI/bge-m3 +# +# Option B: Use HuggingFace for better complex query understanding +# EMBEDDING_PROVIDER=huggingface +# EMBEDDING_MODEL=intfloat/multilingual-e5-large-instruct +# Specs: 1024-dim, 560M params, instruction-tuned for RAG +# Quality: Excellent+ (better understanding of complex queries) +# Ingestion speed: ~120-200 docs/minute +# RAM usage: ~2.5GB +# Multilingual: ✅ Excellent (100+ languages) +# Note: E5-large-instruct handles technical jargon and complex questions better -# Production alternatives (uncomment to upgrade): -# EMBEDDING_MODEL=BAAI/bge-base-en-v1.5 # Balanced: 768-dim, 2x faster than large -# EMBEDDING_MODEL=BAAI/bge-large-en-v1.5 # Best Quality: 1024-dim, excellent retrieval -# EMBEDDING_MODEL=BAAI/bge-m3 # Multilingual: 1024-dim, 100+ languages +# --- [4] HIGH-END CONFIG (32GB+ RAM, high-end GPU, high-end CPU) --- +# Best for: Maximum quality, production systems, critical applications +# EMBEDDING_PROVIDER=huggingface +# EMBEDDING_MODEL=intfloat/multilingual-e5-large-instruct +# Specs: 1024-dim, 560M params, instruction-tuned, state-of-the-art RAG +# Quality: Best available for open-source multilingual embeddings +# Ingestion speed: ~100-180 docs/minute (with parallelization) +# RAM usage: ~2.5-3GB +# Multilingual: ✅ Excellent (100+ languages) +# Best for: Complex technical documents, research papers, precise retrieval +# Provider: ⚠️ HuggingFace only (not supported by FastEmbed) +# +# Alternative (if you need even larger context): +# EMBEDDING_PROVIDER=fastembed +# 
EMBEDDING_MODEL=BAAI/bge-large-en-v1.5 +# English-only but highest quality for English technical documents # ============================================ # RETRIEVAL & RERANKING CONFIGURATION @@ -74,10 +190,66 @@ RETRIEVAL_CHUNKS=100 # Initial chunks to retrieve (more = better recall, recomm TOP_N_RERANK=8 # Keep best N after reranking (recommended: 5-15) USE_RERANKING=true # Enable reranking for better relevance (highly recommended) -# --- RERANKER MODEL OPTIONS --- -# Current: Good balance of speed and quality (278M params) -RERANKER_MODEL=BAAI/bge-reranker-base +# CURRENT: Mid-end configuration (multilingual) +RERANKER_MODEL=BAAI/bge-reranker-v2-m3 -# Production alternatives (uncomment to upgrade): -# RERANKER_MODEL=BAAI/bge-reranker-large # Higher quality: 560M params, 2x slower -# RERANKER_MODEL=BAAI/bge-reranker-v2-m3 # State-of-the-art: 568M params, multilingual +# ============================================ +# HARDWARE-BASED RERANKER MODEL RECOMMENDATIONS +# ============================================ +# Choose the configuration that matches your hardware: + +# --- [1] LOW-END CONFIG (8GB RAM, low-end GPU, low-end CPU) --- +# Best for: Budget systems, fast queries +# Option A: Disable reranking for maximum speed +# USE_RERANKING=false +# TOP_N_RERANK=8 # Not used when reranking disabled +# Speed: Instant (no reranking overhead) +# +# Option B: Lightweight reranker (recommended if you can spare 2GB RAM) +# RERANKER_MODEL=BAAI/bge-reranker-base +# Specs: 278M params, English-focused but works for other languages +# Quality: Good +# Query overhead: +0.3-0.5 seconds +# RAM usage: ~1-1.5GB +# Multilingual: ⚠️ English-focused, acceptable for others + +# --- [2] MID-END CONFIG (16GB RAM, mid-end GPU, mid-end CPU) --- ✅ CURRENT +# Best for: Multilingual systems, good quality/speed balance +# RERANKER_MODEL=BAAI/bge-reranker-v2-m3 +# Specs: 568M params, state-of-the-art multilingual reranker +# Quality: Excellent for all languages +# Query overhead: +0.5-1 second 
+# RAM usage: ~2-3GB +# Multilingual: ✅ Excellent (100+ languages) +# Top N: 8-10 recommended +# +# English-only alternative (slightly faster): +# RERANKER_MODEL=BAAI/bge-reranker-large +# Specs: 560M params, English-only, similar speed to v2-m3 +# Quality: Excellent for English + +# --- [3] BETTER MID-END CONFIG (16GB RAM, modern GPU, modern CPU) --- +# Best for: Modern systems with good CPU, multilingual quality +# RERANKER_MODEL=BAAI/bge-reranker-v2-m3 +# Specs: Same as mid-end, but modern CPU handles it faster +# Quality: Excellent +# Query overhead: +0.3-0.6 seconds (faster CPU) +# RAM usage: ~2-3GB +# Top N: 10-12 recommended (can afford more reranking) +# Note: Good CPU makes reranking much faster, can increase TOP_N_RERANK + +# --- [4] HIGH-END CONFIG (32GB+ RAM, high-end GPU, high-end CPU) --- +# Best for: Maximum accuracy, production systems +# RERANKER_MODEL=BAAI/bge-reranker-v2-minicpm-layerwise +# Specs: 2.4B params, state-of-the-art quality, multilingual +# Quality: Best available (4x more parameters than v2-m3) +# Query overhead: +1.5-3 seconds (3-5x slower than v2-m3) +# RAM usage: ~8-16GB (just for reranker!) 
+# Multilingual: ✅ Excellent (100+ languages) +# Top N: 12-15 recommended +# Best for: Critical applications where accuracy > speed +# +# Alternative (balanced high-end): +# RERANKER_MODEL=BAAI/bge-reranker-v2-m3 +# Use current model but with higher TOP_N_RERANK=15 +# Faster queries while maintaining excellent quality diff --git a/README.md b/README.md index 388eb82..830b1d9 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,11 @@ A Retrieval-Augmented Generation (RAG) system that lets you query your documents - [Upgrading to Production Quality](#upgrading-to-production-quality) - [🏆 Recommended Production Configurations](#-recommended-production-configurations) - [⚡ Performance Impact Summary](#-performance-impact-summary) +- [🌍 Multilingual Functionality Guide](#-multilingual-functionality-guide) + - [How Each Component Affects Multilingual Support](#how-each-component-affects-multilingual-support) + - [Current System Multilingual Capability](#current-system-multilingual-capability) + - [Upgrading to Full Multilingual Support](#upgrading-to-full-multilingual-support) + - [Testing Multilingual Functionality](#testing-multilingual-functionality) - [Chunking Configuration Guide](#chunking-configuration-guide) - [What is Chunking?](#what-is-chunking) - [Current Default Settings](#current-default-settings) @@ -423,6 +428,140 @@ OLLAMA_MODEL=qwen2.5:14b-instruct --- +## 🌍 Multilingual Functionality Guide + +The chatbot **automatically responds in the language you use** to ask questions (English, French, Spanish, etc.). However, **each model component affects multilingual quality differently**: + +### How Each Component Affects Multilingual Support + +#### **1. 
Embedding Model - CRITICAL for Multilingual Retrieval** 🔴 + +**Impact:** Determines if your question in ANY language can find relevant documents + +**Current Model:** `sentence-transformers/all-MiniLM-L6-v2` +- ⚠️ **English-only optimized** +- Non-English queries will retrieve less relevant documents +- Works for English, poor for French/Spanish/other languages + +**Recommended for Multilingual:** +```env +EMBEDDING_MODEL=BAAI/bge-m3 +# or +EMBEDDING_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 +``` + +**Why it matters:** +- French question → English-focused embeddings → retrieves wrong documents → LLM gets irrelevant context → poor answer **even if LLM speaks French** +- Multilingual embeddings → retrieves correct documents in any language → LLM gets relevant context → excellent answer + +**⚠️ Requires re-ingestion:** YES - `python rag/ingest.py` + +--- + +#### **2. Reranker Model - Important for Multilingual Precision** 🟡 + +**Impact:** Refines which documents are most relevant to your question + +**Current Model:** `BAAI/bge-reranker-base` +- ⚠️ **English-focused** +- Can rerank, but less accurate for non-English queries + +**Recommended for Multilingual:** +```env +RERANKER_MODEL=BAAI/bge-reranker-v2-m3 +``` + +**Why it matters:** +- Even if embeddings retrieve 10 good multilingual documents, English-only reranker might rank them poorly +- Multilingual reranker correctly identifies the most relevant chunks in any language + +**⚠️ Requires re-ingestion:** NO - just update `.env` and restart + +--- + +#### **3. 
LLM (Text Generation Model) - Determines Answer Language** 🟢 + +**Impact:** Generates the actual response in the target language + +**Current Model:** `qwen2.5:14b-instruct` +- ✅ **Excellent multilingual support** (100+ languages) +- Strong in: English, Chinese, French, Spanish, German, Japanese, Korean, Arabic, and more +- The prompt automatically instructs it to respond in the question's language + +**Alternative Multilingual LLMs:** +```bash +ollama pull qwen2.5:32b-instruct # Best multilingual quality +ollama pull llama3.1:8b # Good for European languages +ollama pull mistral:7b-instruct # Good for French/English +``` + +**Why it matters:** +- Even with perfect retrieval, if LLM doesn't support the language, answers will be poor or in wrong language +- Qwen models are already excellent for multilingual - upgrading mainly improves reasoning depth + +**⚠️ Requires re-ingestion:** NO - just update `.env` and restart + +--- + +### Current System Multilingual Capability + +| Component | Current Model | Multilingual? | Impact on Non-English | +|-----------|---------------|---------------|------------------------| +| **Embedding** | all-MiniLM-L6-v2 | ❌ English-only | 🔴 **Poor retrieval** for non-English questions | +| **Reranker** | bge-reranker-base | ⚠️ English-focused | 🟡 **Suboptimal ranking** for non-English | +| **LLM** | qwen2.5:14b-instruct | ✅ Excellent | ✅ **Perfect responses** in any language | + +**Result:** The LLM **CAN respond** in French/Spanish/etc., but will work with **lower-quality context** retrieved by English-only embeddings. + +--- + +### Upgrading to Full Multilingual Support + +**Recommended Configuration:** + +```env +# In .env file +EMBEDDING_MODEL=BAAI/bge-m3 +RERANKER_MODEL=BAAI/bge-reranker-v2-m3 +OLLAMA_MODEL=qwen2.5:14b-instruct +``` + +**Steps:** +1. Update `.env` with multilingual models +2. Re-ingest documents: `python rag/ingest.py` (required for embedding change) +3. 
Restart frontend/queries + +**Benefits:** +- ✅ Excellent retrieval for questions in **any language** +- ✅ Accurate reranking regardless of language +- ✅ High-quality answers in **100+ languages** + +**Trade-offs:** +- Slightly slower (BGE-m3 is ~2x slower than all-MiniLM-L6-v2) +- Larger model downloads (~3GB vs 90MB) + +--- + +### Testing Multilingual Functionality + +```powershell +# English +python rag/query.py "What are the latest V-PCC compression results?" + +# French +python rag/query.py "Quels sont les derniers résultats de compression V-PCC ?" + +# Spanish +python rag/query.py "¿Cuáles son los últimos resultados de compresión V-PCC?" +``` + +**Expected behavior:** +- ✅ LLM responds in the correct language (works with current setup) +- ⚠️ Answer quality may be lower for non-English with current English-only embeddings +- ✅ Full quality in all languages after upgrading to multilingual embeddings + +--- + ## Chunking Configuration Guide ### What is Chunking? diff --git a/rag/ingest.py b/rag/ingest.py index 05d9a6e..bf580b4 100644 --- a/rag/ingest.py +++ b/rag/ingest.py @@ -21,6 +21,7 @@ from langchain_community.document_loaders import ( # type: ignore ) from langchain_text_splitters import RecursiveCharacterTextSplitter # type: ignore from langchain_community.embeddings import FastEmbedEmbeddings # type: ignore +from langchain_huggingface import HuggingFaceEmbeddings # type: ignore from langchain_chroma import Chroma # type: ignore from langchain_core.documents import Document # type: ignore from concurrent.futures import ThreadPoolExecutor, as_completed @@ -331,14 +332,27 @@ def main() -> None: chunks = splitter.split_documents(all_docs) print(f" Created {len(chunks)} chunks in {time.time() - start_split:.2f}s") + embedding_provider = os.getenv("EMBEDDING_PROVIDER", "fastembed").lower() embedding_model = os.getenv("EMBEDDING_MODEL") - print(f"\nUsing FastEmbed embeddings: {embedding_model}") - embeddings = FastEmbedEmbeddings( - model_name=embedding_model, - 
max_length=512, - threads=4 - ) + print(f"\nSetting up embeddings...") + print(f" Provider: {embedding_provider}") + print(f" Model: {embedding_model}") + + if embedding_provider == "huggingface": + print(" Using HuggingFace embeddings (supports any model)") + embeddings = HuggingFaceEmbeddings( + model_name=embedding_model, + model_kwargs={'device': 'cpu'}, + encode_kwargs={'normalize_embeddings': True} + ) + else: + print(" Using FastEmbed embeddings (optimized, limited models)") + embeddings = FastEmbedEmbeddings( + model_name=embedding_model, + max_length=512, + threads=4 + ) chroma_dir.mkdir(parents=True, exist_ok=True) print(f"\nBuilding Chroma index at: {chroma_dir}") diff --git a/rag/query.py b/rag/query.py index dcef13e..9002067 100644 --- a/rag/query.py +++ b/rag/query.py @@ -6,6 +6,7 @@ from typing import List from dotenv import load_dotenv from langchain_ollama import ChatOllama # type: ignore from langchain_community.embeddings import FastEmbedEmbeddings # type: ignore +from langchain_huggingface import HuggingFaceEmbeddings # type: ignore from langchain_chroma import Chroma # type: ignore from langchain_core.documents import Document # type: ignore from langchain_core.prompts import ChatPromptTemplate # type: ignore @@ -37,12 +38,31 @@ def format_docs(docs: List[Document]) -> str: return "\n\n".join(f"[Source: {d.metadata.get('source', 'unknown')}]\n{d.page_content}" for d in docs) +def get_embeddings(): + """Get embeddings based on the configured provider.""" + embedding_provider = os.getenv("EMBEDDING_PROVIDER", "fastembed").lower() + embedding_model = os.getenv("EMBEDDING_MODEL") + + if embedding_provider == "huggingface": + return HuggingFaceEmbeddings( + model_name=embedding_model, + model_kwargs={'device': 'cpu'}, + encode_kwargs={'normalize_embeddings': True} + ) + else: + return FastEmbedEmbeddings( + model_name=embedding_model, + max_length=512 + ) + + def get_llm(): model_name = os.getenv("OLLAMA_MODEL") base_url = 
os.getenv("OLLAMA_BASE_URL") return ChatOllama(model=model_name, base_url=base_url), model_name + def run_query_complete(query: str, provider: str = "ollama") -> tuple[str, str, list[str]]: """ Run a complete query and return (answer, model_name, sources). @@ -55,11 +75,7 @@ def run_query_complete(query: str, provider: str = "ollama") -> tuple[str, str, if not chroma_dir.exists(): raise FileNotFoundError(f"Vector store directory not found: {chroma_dir}. Run ingestion first.") - embedding_model = os.getenv("EMBEDDING_MODEL") - embeddings = FastEmbedEmbeddings( - model_name=embedding_model, - max_length=512 - ) + embeddings = get_embeddings() vectorstore = Chroma(persist_directory=str(chroma_dir), embedding_function=embeddings) @@ -75,10 +91,18 @@ def run_query_complete(query: str, provider: str = "ollama") -> tuple[str, str, compressor = Reranker(model_name=reranker_model, top_n=top_n) docs = compressor.compress_documents(docs, query) - # Use the EXACT same prompt as the terminal version for consistent quality prompt = ChatPromptTemplate.from_messages([ ("system", """You are an expert technical assistant that provides extremely detailed, comprehensive, and in-depth answers based on the given context. 
+🌍 LANGUAGE REQUIREMENT - CRITICAL: +- ALWAYS respond in the SAME LANGUAGE as the question +- If the question is in French, respond in French +- If the question is in English, respond in English +- If the question is in Spanish, respond in Spanish +- Apply this to ANY language the user asks in +- Maintain the SAME level of technical detail and quality regardless of language +- Technical terms can remain in English if there's no standard translation, but explain them in the question's language + CRITICAL INSTRUCTIONS - YOUR ANSWERS MUST BE DETAILED AND THOROUGH: 📝 LENGTH & DEPTH REQUIREMENTS: @@ -182,11 +206,7 @@ def main() -> None: print(f"Loading Chroma from: {chroma_dir}") start_time = time.time() - embedding_model = os.getenv("EMBEDDING_MODEL") - embeddings = FastEmbedEmbeddings( - model_name=embedding_model, - max_length=512 - ) + embeddings = get_embeddings() vectorstore = Chroma(persist_directory=str(chroma_dir), embedding_function=embeddings) print(f"Loaded in {time.time() - start_time:.2f}s") @@ -210,6 +230,15 @@ def main() -> None: prompt = ChatPromptTemplate.from_messages([ ("system", """You are an expert technical assistant that provides extremely detailed, comprehensive, and in-depth answers based on the given context. 
+🌍 LANGUAGE REQUIREMENT - CRITICAL: +- ALWAYS respond in the SAME LANGUAGE as the question +- If the question is in French, respond in French +- If the question is in English, respond in English +- If the question is in Spanish, respond in Spanish +- Apply this to ANY language the user asks in +- Maintain the SAME level of technical detail and quality regardless of language +- Technical terms can remain in English if there's no standard translation, but explain them in the question's language + CRITICAL INSTRUCTIONS - YOUR ANSWERS MUST BE DETAILED AND THOROUGH: 📝 LENGTH & DEPTH REQUIREMENTS: diff --git a/requirements.txt b/requirements.txt index 146b9fa..1a696bd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,7 @@ langchain>=1.0.8 langchain-community>=0.4.1 langchain-ollama>=1.0.0 langchain-chroma>=0.1.2 +langchain-huggingface>=0.1.0 chromadb>=1.3.5 fastembed>=0.7.3 python-dotenv>=1.2.1 -- GitLab From 1b9d6894fe07a56c16af78ecfb3074cef590a37c Mon Sep 17 00:00:00 2001 From: Nozomu05 Date: Wed, 25 Feb 2026 17:35:15 +0100 Subject: [PATCH 2/2] ajout du multilingue avec l'ajout de QWEN sans utilisation de OLLAMA --- .env.example | 183 ++++++++++++++++++------- .gitignore | 10 +- README.md | 292 ++++++++++++++++++++++++---------------- frontend/app.py | 3 +- rag/ingest.py | 26 ++-- rag/query.py | 89 ++++++++++-- rag/transformers_llm.py | 280 ++++++++++++++++++++++++++++++++++++++ requirements.txt | 7 +- 8 files changed, 695 insertions(+), 195 deletions(-) create mode 100644 rag/transformers_llm.py diff --git a/.env.example b/.env.example index 88339bf..a712526 100644 --- a/.env.example +++ b/.env.example @@ -1,14 +1,86 @@ # ============================================ # MODEL PROVIDER CONFIGURATION # ============================================ -MODEL_PROVIDER=ollama +LLM_PROVIDER=transformers # -------------------------------------------- -# OLLAMA SETTINGS (Local models: Qwen, Mistral, Llama, etc.) 
+# TRANSFORMERS SETTINGS (HuggingFace models) # -------------------------------------------- -# CURRENT: Mid-end configuration (16GB RAM, mid-end GPU/CPU) -OLLAMA_MODEL=qwen2.5:14b-instruct -OLLAMA_BASE_URL=http://localhost:11434 +TRANSFORMERS_MODEL=Qwen/Qwen2.5-7B-Instruct +MAX_NEW_TOKENS=2048 +TEMPERATURE=0.7 +# Model will download to ~/.cache/huggingface/ (about 15GB for 7B model) +# QUANTIZATION: Reduces memory usage (important for GPUs with limited VRAM) +# 4bit - Use 4-bit quantization (~5GB VRAM for 7B model, recommended for GPUs with <12GB VRAM) +# 8bit - Use 8-bit quantization (~8GB VRAM for 7B model) +# none - No quantization (~15GB VRAM for 7B model, requires high-end GPU) +QUANTIZATION=4bit +# MAX_NEW_TOKENS: Increase for longer, more complete answers. 2048 allows detailed responses. + +# -------------------------------------------- +# DEVICE CONFIGURATION +# -------------------------------------------- +# Control which device each component uses +# Options: +# auto - Auto-detect GPU and use it if available (recommended for LLM) +# cuda - Force GPU usage (fastest, requires NVIDIA GPU with CUDA) +# cpu - Force CPU usage (slower but works on any computer) +# +# Components: +LLM_DEVICE=auto # Qwen language model (main inference) +EMBEDDING_DEVICE=cuda # Document/query embeddings (10-50x faster on GPU) +RERANKER_DEVICE=cuda # Re-ranking model (improves result quality) +# +# Recommendations: +# - With GPU: Use cuda/auto for all (best performance) +# - CPU only: Set all to cpu +# - Limited GPU memory: LLM=cpu, EMBEDDING=cuda, RERANKER=cpu + +# ============================================ +# HARDWARE-BASED MODEL RECOMMENDATIONS +# ============================================ +# Choose the configuration that matches your hardware: + +# Copy of .env for reference and quick setup; edit as needed. 
+ +# ============================================ +# MODEL PROVIDER CONFIGURATION +# ============================================ +LLM_PROVIDER=transformers + +# -------------------------------------------- +# TRANSFORMERS SETTINGS (HuggingFace models) +# -------------------------------------------- +TRANSFORMERS_MODEL=Qwen/Qwen2.5-14B-Instruct +MAX_NEW_TOKENS=4096 +TEMPERATURE=0 +LLM_SEED=42 +# Model will download to ~/.cache/huggingface/ (about 28GB for 14B model) +# QUANTIZATION: Reduces memory usage (important for GPUs with limited VRAM) +# 4bit - Use 4-bit quantization (~8-10GB VRAM for 14B model, recommended for GPUs with 12-16GB VRAM) +# 8bit - Use 8-bit quantization (~15GB VRAM for 14B model) +# none - No quantization (~28GB VRAM for 14B model, requires high-end GPU) +QUANTIZATION=4bit +# MAX_NEW_TOKENS: Increase for longer, more complete answers. 4096 allows longer responses. + +# -------------------------------------------- +# DEVICE CONFIGURATION +# -------------------------------------------- +# Control which device each component uses +# Options: +# auto - Auto-detect GPU and use it if available (recommended for LLM) +# cuda - Force GPU usage (fastest, requires NVIDIA GPU with CUDA) +# cpu - Force CPU usage (slower but works on any computer) +# +# Components: +LLM_DEVICE=auto # Qwen language model (main inference) +EMBEDDING_DEVICE=cuda # Document/query embeddings (10-50x faster on GPU) +RERANKER_DEVICE=cuda # Re-ranking model (improves result quality) +# +# Recommendations: +# - With GPU: Use cuda/auto for all (best performance) +# - CPU only: Set all to cpu +# - Limited GPU memory: LLM=cpu, EMBEDDING=cuda, RERANKER=cpu # ============================================ # HARDWARE-BASED MODEL RECOMMENDATIONS @@ -17,34 +89,33 @@ OLLAMA_BASE_URL=http://localhost:11434 # --- [1] LOW-END CONFIG (8GB RAM, low-end GPU, low-end CPU) --- # Best for: Budget laptops, older computers, testing -# OLLAMA_MODEL=qwen2.5:3b-instruct -# Alternative: qwen2.5:7b-instruct (if 
you can spare the RAM) +# TRANSFORMERS_MODEL=Qwen/Qwen2.5-3B-Instruct +# Alternative: Qwen/Qwen2.5-7B-Instruct (if you can spare the RAM) # Expected speed: 5-10 seconds/query (CPU), 1-3 seconds (GPU) # RAM usage: ~4-6GB # --- [2] MID-END CONFIG (16GB RAM, mid-end GPU, mid-end CPU) --- ✅ CURRENT # Best for: Modern laptops, standard workstations -# OLLAMA_MODEL=qwen2.5:14b-instruct -# Alternative: qwen2.5:7b-instruct (faster) +# TRANSFORMERS_MODEL=Qwen/Qwen2.5-14B-Instruct +# Alternative: Qwen/Qwen2.5-7B-Instruct (faster) # Expected speed: 8-15 seconds/query (CPU), 2-4 seconds (GPU) # RAM usage: ~10-14GB # --- [3] BETTER MID-END CONFIG (16GB RAM, modern GPU, modern CPU) --- # Best for: Gaming PCs, modern workstations with RTX/RX GPUs -# OLLAMA_MODEL=qwen2.5:14b-instruct -# Alternative: qwen2.5:32b-instruct (if GPU has 12GB+ VRAM) +# TRANSFORMERS_MODEL=Qwen/Qwen2.5-14B-Instruct +# Alternative: Qwen/Qwen2.5-32B-Instruct (if GPU has 12GB+ VRAM) # Expected speed: 1-2 seconds/query (good GPU), 6-10 seconds (CPU) # RAM usage: ~12-16GB -# Note: Modern GPU makes huge difference, can handle 32b with quantization +# Note: Modern GPU makes huge difference, can handle 32B models # --- [4] HIGH-END CONFIG (32GB+ RAM, high-end GPU, high-end CPU) --- # Best for: Workstations, servers, RTX 4090/A6000, ThreadRipper/Xeon -# OLLAMA_MODEL=qwen2.5:32b-instruct +# TRANSFORMERS_MODEL=Qwen/Qwen2.5-32B-Instruct # Alternatives: -# qwen2.5:72b-instruct (64GB+ RAM) -# qwen2.5:110b-instruct (80-128GB RAM, ultimate quality) +# Qwen/Qwen2.5-72B-Instruct (64GB+ RAM) # Expected speed: 0.5-1.5 seconds/query (high-end GPU), 15-30 seconds (CPU) -# RAM usage: ~24-40GB (32b), ~50-80GB (72b), ~90-120GB (110b) +# RAM usage: ~24-40GB (32B), ~50-80GB (72B) # ============================================ # DOCUMENT PROCESSING @@ -93,6 +164,50 @@ EMBEDDING_PROVIDER=huggingface # CURRENT: Mid-end configuration (multilingual) EMBEDDING_MODEL=intfloat/multilingual-e5-large-instruct +# 
============================================ +# HOW TO SWITCH EMBEDDING PROVIDERS +# ============================================ +# +# OPTION 1: FastEmbed (faster, optimized, limited model support) +# +# --- CHUNKING CONFIGURATION --- +# How documents are split affects answer quality! +# Chunk size = characters per chunk | Overlap = shared chars between adjacent chunks + +# CURRENT: General technical documents (good balance) +CHUNK_SIZE=800 # ~150-200 words, 2-3 paragraphs +CHUNK_OVERLAP=100 # 12.5% overlap prevents context loss at boundaries + +# ADJUST BASED ON YOUR DOCUMENT TYPE: +# Dense technical specs/standards (MPEG, ISO, etc.): +# CHUNK_SIZE=1000-1200, CHUNK_OVERLAP=150-200 +# → Preserves complete technical descriptions, tables, multi-paragraph explanations +# +# Short Q&A, FAQs, snippets: +# CHUNK_SIZE=500-600, CHUNK_OVERLAP=75-100 +# → More precision, faster retrieval, good for focused answers +# +# Long-form articles, research papers: +# CHUNK_SIZE=1500-2000, CHUNK_OVERLAP=300-400 +# → Preserves narrative flow, complete arguments, methodology sections +# +# Mixed document collection: +# CHUNK_SIZE=800-1000, CHUNK_OVERLAP=120-150 +# → Balanced for various content types +# +# NOTE: Changing these requires re-ingestion: python rag/ingest.py + +# ============================================ +# EMBEDDING MODEL CONFIGURATION +# ============================================ + +# --- EMBEDDING PROVIDER SELECTION --- +# Choose: "fastembed" (faster, limited models) or "huggingface" (flexible, any model) +EMBEDDING_PROVIDER=huggingface + +# CURRENT: Mid-end configuration (multilingual) +EMBEDDING_MODEL=intfloat/multilingual-e5-large-instruct + # ============================================ # HOW TO SWITCH EMBEDDING PROVIDERS # ============================================ @@ -130,7 +245,7 @@ EMBEDDING_MODEL=intfloat/multilingual-e5-large-instruct # Quality: Good for most languages # Ingestion speed: ~300-500 docs/minute # Provider: ✅ Both FastEmbed and 
HuggingFace - +# # --- [2] MID-END CONFIG (16GB RAM, mid-end GPU, mid-end CPU) --- # Best for: Multilingual document collections, good balance # EMBEDDING_PROVIDER=fastembed @@ -149,7 +264,7 @@ EMBEDDING_MODEL=intfloat/multilingual-e5-large-instruct # Quality: Excellent for English # Ingestion speed: ~250-400 docs/minute # Provider: ✅ FastEmbed supported - +# # --- [3] BETTER MID-END CONFIG (16GB RAM, modern GPU, modern CPU) --- ✅ CURRENT # Best for: High-quality multilingual RAG, modern systems # Option A: Use FastEmbed (faster) @@ -165,7 +280,7 @@ EMBEDDING_MODEL=intfloat/multilingual-e5-large-instruct # RAM usage: ~2.5GB # Multilingual: ✅ Excellent (100+ languages) # Note: E5-large-instruct handles technical jargon and complex questions better - +# # --- [4] HIGH-END CONFIG (32GB+ RAM, high-end GPU, high-end CPU) --- # Best for: Maximum quality, production systems, critical applications # EMBEDDING_PROVIDER=huggingface @@ -182,7 +297,7 @@ EMBEDDING_MODEL=intfloat/multilingual-e5-large-instruct # EMBEDDING_PROVIDER=fastembed # EMBEDDING_MODEL=BAAI/bge-large-en-v1.5 # English-only but highest quality for English technical documents - +# # ============================================ # RETRIEVAL & RERANKING CONFIGURATION # ============================================ @@ -212,7 +327,7 @@ RERANKER_MODEL=BAAI/bge-reranker-v2-m3 # Query overhead: +0.3-0.5 seconds # RAM usage: ~1-1.5GB # Multilingual: ⚠️ English-focused, acceptable for others - +# # --- [2] MID-END CONFIG (16GB RAM, mid-end GPU, mid-end CPU) --- ✅ CURRENT # Best for: Multilingual systems, good quality/speed balance # RERANKER_MODEL=BAAI/bge-reranker-v2-m3 @@ -227,29 +342,5 @@ RERANKER_MODEL=BAAI/bge-reranker-v2-m3 # RERANKER_MODEL=BAAI/bge-reranker-large # Specs: 560M params, English-only, similar speed to v2-m3 # Quality: Excellent for English - -# --- [3] BETTER MID-END CONFIG (16GB RAM, modern GPU, modern CPU) --- -# Best for: Modern systems with good CPU, multilingual quality -# 
RERANKER_MODEL=BAAI/bge-reranker-v2-m3 -# Specs: Same as mid-end, but modern CPU handles it faster -# Quality: Excellent -# Query overhead: +0.3-0.6 seconds (faster CPU) -# RAM usage: ~2-3GB -# Top N: 10-12 recommended (can afford more reranking) -# Note: Good CPU makes reranking much faster, can increase TOP_N_RERANK - -# --- [4] HIGH-END CONFIG (32GB+ RAM, high-end GPU, high-end CPU) --- -# Best for: Maximum accuracy, production systems -# RERANKER_MODEL=BAAI/bge-reranker-v2-minicpm-layerwise -# Specs: 2.4B params, state-of-the-art quality, multilingual -# Quality: Best available (4x more parameters than v2-m3) -# Query overhead: +1.5-3 seconds (3-5x slower than v2-m3) -# RAM usage: ~8-16GB (just for reranker!) -# Multilingual: ✅ Excellent (100+ languages) -# Top N: 12-15 recommended -# Best for: Critical applications where accuracy > speed # -# Alternative (balanced high-end): -# RERANKER_MODEL=BAAI/bge-reranker-v2-m3 -# Use current model but with higher TOP_N_RERANK=15 -# Faster queries while maintaining excellent quality +``` diff --git a/.gitignore b/.gitignore index e337265..b077bd0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # Python .venv/ venv/ +QWEN/ __pycache__/ *.py[cod] *$py.class @@ -39,13 +40,18 @@ wheels/ Thumbs.db # Project specific -storage/chroma/ -docs/ + storage/.ingest_cache.json ingestion_errors.log # Office temporary files ~$* +# Ignore everything inside docs/ but not the folder itself +docs/* +!docs/ +# Ignore everything inside storage/ but not the folder itself +storage/* +!storage/ *.tmp diff --git a/README.md b/README.md index 830b1d9..3837d1b 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,22 @@ # RAG System with Qwen -A Retrieval-Augmented Generation (RAG) system that lets you query your documents using Ollama and Qwen models locally. +A Retrieval-Augmented Generation (RAG) system that lets you query your documents using Qwen models from HuggingFace Transformers locally. 
--- ## Table of Contents - [Installation and Setup](#installation-and-setup) - - [Step 1: Install Ollama](#step-1-install-ollama) - - [Step 2: Pull Required Ollama Models](#step-2-pull-required-ollama-models) - - [Step 3: Enable CPU-Only Mode (For Low-End Computers)](#step-3-enable-cpu-only-mode-for-low-end-computers) - - [Step 4: Install Pandoc (Optional)](#step-4-install-pandoc-optional) - - [Step 5: Setup Python Environment](#step-5-setup-python-environment) - - [Step 6: Configure Environment](#step-6-configure-environment) - - [Step 7: Add Your Documents](#step-7-add-your-documents) - - [Step 8: Ingest Documents](#step-8-ingest-documents) - - [Step 9: Start the Frontend](#step-9-start-the-frontend) + - [Linux Prerequisites](#linux-prerequisites) + - [Step 1: Setup Python Environment](#step-1-setup-python-environment) + - [Step 2: Install Pandoc (Optional)](#step-2-install-pandoc-optional) + - [Step 3: Configure Environment](#step-3-configure-environment) + - [Step 4: Add Your Documents](#step-4-add-your-documents) + - [Step 5: Ingest Documents](#step-5-ingest-documents) + - [Step 6: Start the Frontend](#step-6-start-the-frontend) - [Command-Line Query (Optional)](#command-line-query-optional) + - [Interactive Mode (Recommended)](#interactive-mode-recommended) + - [Single Query Mode](#single-query-mode) - [Performance Notes](#performance-notes) - [CPU vs GPU Mode](#cpu-vs-gpu-mode) - [Model Recommendations by Hardware](#model-recommendations-by-hardware) @@ -43,86 +43,72 @@ A Retrieval-Augmented Generation (RAG) system that lets you query your documents ## Installation and Setup -### Step 1: Install Ollama +### Linux Prerequisites -**Windows (via Winget):** -```powershell -winget install Ollama.Ollama -e -``` +**For Ubuntu/Debian-based distributions:** +```bash +# Update package list +sudo apt update -Verify installation: -```powershell -ollama --version -``` +# Install Python 3.10+ and pip +sudo apt install python3 python3-pip python3-venv -Ollama runs as a 
Windows service automatically. If not running: -```powershell -ollama serve +# Install development tools (required for some Python packages) +sudo apt install build-essential python3-dev ``` -**macOS (via Homebrew):** +**For Fedora/RHEL/CentOS:** ```bash -brew install ollama -``` +# Install Python 3.10+ and pip +sudo dnf install python3 python3-pip -Verify installation: -```bash -ollama --version +# Install development tools +sudo dnf groupinstall "Development Tools" +sudo dnf install python3-devel ``` -Start Ollama service: +**For Arch Linux:** ```bash -ollama serve -``` - -**macOS (Manual Download):** -Download from [https://ollama.ai/download](https://ollama.ai/download) and install the .dmg file. +# Install Python and pip +sudo pacman -S python python-pip -### Step 2: Pull Required Ollama Models - -**LLM Model (for answering queries):** -```bash -ollama pull qwen2.5:14b-instruct +# Install base development tools +sudo pacman -S base-devel ``` -**Embedding Model (for semantic search):** +**Verify Python installation:** ```bash -ollama pull mxbai-embed-large +python3 --version # Should be 3.10 or higher +pip3 --version ``` -**Note for low-end computers:** The 14b model requires ~16GB RAM. 
If you have less RAM, use: -```bash -ollama pull qwen2.5:7b-instruct # Requires ~8GB RAM -``` - -### Step 3: Enable CPU-Only Mode (For Low-End Computers) - -**If you have a low-end computer or insufficient GPU memory**, force Ollama to run on CPU only: +### Step 1: Setup Python Environment **Windows:** ```powershell -[System.Environment]::SetEnvironmentVariable('OLLAMA_NUM_GPU', '0', 'User') -$env:OLLAMA_NUM_GPU = '0' +python -m venv .venv +.\.venv\Scripts\Activate.ps1 +``` + +**Note:** If you get an execution policy error: +```powershell +Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser ``` **macOS/Linux:** ```bash -echo 'export OLLAMA_NUM_GPU=0' >> ~/.bashrc # or ~/.zshrc for zsh -source ~/.bashrc # or source ~/.zshrc +python3 -m venv .venv +source .venv/bin/activate ``` -Restart your terminal after setting this. The model will run slower but work on any computer. - -**To re-enable GPU later (if you upgrade hardware):** -```powershell -# Windows -[System.Environment]::SetEnvironmentVariable('OLLAMA_NUM_GPU', '1', 'User') -``` +**Install dependencies (all platforms):** ```bash -# macOS/Linux - remove the line from ~/.bashrc or ~/.zshrc +pip install -r requirements.txt ``` -### Step 4: Install Pandoc (Optional) +**Note:** The first time you run a query, the Qwen model (~28GB for 14B model) will download automatically to `~/.cache/huggingface/`. This may take some time depending on your internet connection. 
+ +### Step 2: Install Pandoc (Optional) Only needed if you have OpenDocument (.odt) files: @@ -136,31 +122,20 @@ winget install --id JohnMacFarlane.Pandoc -e brew install pandoc ``` -### Step 5: Setup Python Environment - -**Windows:** -```powershell -python -m venv .venv -.\.venv\Scripts\Activate.ps1 -``` - -**Note:** If you get an execution policy error: -```powershell -Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser -``` - -**macOS/Linux:** +**Linux:** ```bash -python3 -m venv .venv -source .venv/bin/activate -``` +# Ubuntu/Debian +sudo apt update +sudo apt install pandoc -**Install dependencies (all platforms):** -```bash -pip install -r requirements.txt +# Fedora/RHEL/CentOS +sudo dnf install pandoc + +# Arch Linux +sudo pacman -S pandoc ``` -### Step 6: Configure Environment +### Step 3: Configure Environment **Windows:** ```powershell @@ -175,27 +150,39 @@ cp .env.example .env Edit `.env` with your settings: ```env # Model Configuration -OLLAMA_MODEL=qwen2.5:14b-instruct -OLLAMA_BASE_URL=http://localhost:11434 -EMBEDDING_MODEL=mxbai-embed-large +LLM_PROVIDER=transformers +TRANSFORMERS_MODEL=Qwen/Qwen2.5-14B-Instruct +MAX_NEW_TOKENS=4096 +TEMPERATURE=0 +LLM_SEED=42 +QUANTIZATION=4bit + +# Device Configuration (auto, cuda, or cpu) +LLM_DEVICE=auto +EMBEDDING_DEVICE=cuda +RERANKER_DEVICE=cuda + +# Embedding Configuration +EMBEDDING_PROVIDER=huggingface +EMBEDDING_MODEL=intfloat/multilingual-e5-large-instruct # Retrieval Settings RETRIEVAL_CHUNKS=100 -TOP_N_RERANK=15 +TOP_N_RERANK=8 USE_RERANKING=true # Document Processing CHUNK_SIZE=800 -CHUNK_OVERLAP=160 +CHUNK_OVERLAP=100 ``` -**Note:** If using 7b model on low-end computer, change to `OLLAMA_MODEL=qwen2.5:7b-instruct` +**Note:** If using a lower-spec computer, change to `TRANSFORMERS_MODEL=Qwen/Qwen2.5-7B-Instruct` for faster performance. If you don't have a GPU, set all device settings to `cpu`. 
-### Step 7: Add Your Documents +### Step 4: Add Your Documents Place your documents (Word, PDF, PowerPoint, Text, Markdown, etc.) in the `docs/` folder. -### Step 8: Ingest Documents +### Step 5: Ingest Documents Run the ingestion script to process your documents: @@ -215,7 +202,7 @@ This will: - Generate embeddings - Store vectors in the database -### Step 9: Start the Frontend +### Step 6: Start the Frontend Start the web interface: @@ -237,25 +224,99 @@ Open this URL in your browser to start querying your documents! ## Command-Line Query (Optional) -You can also run queries directly from the command line: +### Interactive Mode (Recommended) + +For multiple queries without reloading the model each time: + +**macOS/Linux:** +```bash +python rag/query_interactive.py +``` **Windows:** ```powershell -python rag\query.py "Your question here" +python rag\query_interactive.py +``` + +This loads the model **once** and keeps it in memory. You can then ask multiple questions without the 15-second checkpoint loading delay. + +**Example session:** ``` +Query: What is V-PCC? +[Answer streams in real-time...] + +Query: How does it compare to G-PCC? +[Answer streams immediately - no reload!] + +Query: quit +``` + +### Single Query Mode + +For one-off queries from the command line: **macOS/Linux:** ```bash python rag/query.py "Your question here" ``` +**Windows:** +```powershell +python rag\query.py "Your question here" +``` + +**Note:** This reloads the model each time (~15s startup) + --- ## Performance Notes ### CPU vs GPU Mode -- **GPU Mode (default):** Fast responses (1-2 seconds with 14b model) -- **CPU-Only Mode:** Slower responses (8-15 seconds with 14b model) but works on any computer + +The system can run on either CPU or GPU for optimal performance. 
You can configure which device each component uses in your `.env` file: + +```env +# Device configuration +# Options: auto (auto-detect GPU), cuda (force GPU), cpu (force CPU) +LLM_DEVICE=auto # Qwen language model +EMBEDDING_DEVICE=cuda # Document/query embeddings +RERANKER_DEVICE=cuda # Re-ranking model +``` + +**Device Options:** +- `auto` - Automatically detects and uses GPU if available (recommended for LLM) +- `cuda` - Forces GPU usage (fastest, requires NVIDIA GPU with CUDA) +- `cpu` - Forces CPU usage (slower but works on any computer) + +**Performance Comparison (14B model):** +- **GPU Mode (cuda):** Fast responses (1-2 seconds) +- **CPU-Only Mode (cpu):** Slower responses (8-15 seconds) but works on any computer +- **Auto Mode (auto):** Best of both worlds - uses GPU if available, falls back to CPU + +**Recommended Configurations:** + +*For systems with NVIDIA GPU:* +```env +LLM_DEVICE=auto # Use GPU if available +EMBEDDING_DEVICE=cuda # Embeddings are 10-50x faster on GPU +RERANKER_DEVICE=cuda # Re-ranking is faster on GPU +``` + +*For CPU-only systems (no GPU):* +```env +LLM_DEVICE=cpu +EMBEDDING_DEVICE=cpu +RERANKER_DEVICE=cpu +``` + +*For systems with limited GPU memory:* +```env +LLM_DEVICE=cpu # Save GPU memory +EMBEDDING_DEVICE=cuda # Embeddings use less memory +RERANKER_DEVICE=cpu # Only when needed +``` + +**Note:** After changing device settings, restart the application for changes to take effect. Re-ingestion is not required after a device change: `EMBEDDING_DEVICE` only selects the hardware the embedding model runs on, not the model itself. ### Model Recommendations by Hardware @@ -349,26 +410,23 @@ No re-ingestion needed, changes apply immediately!
- Use case: Complex reasoning, technical documents - Requirements: 32GB+ RAM recommended -**Option 2 - Maximum Quality:** `qwen2.5:72b-instruct` (72B params, 48GB VRAM/64GB RAM) +**Option 2 - Maximum Quality:** `Qwen/Qwen2.5-72B-Instruct` (72B params, 48GB VRAM/64GB RAM) - Speed: ⚡⚡ Slow (5x slower) - Quality: ⭐⭐⭐⭐⭐ Best available - Use case: Research, critical analysis, highest accuracy - Requirements: 64GB+ RAM, powerful hardware -**Option 3 - Faster Lightweight:** `qwen2.5:7b-instruct` (7B params, 4GB VRAM/8GB RAM) +**Option 3 - Faster Lightweight:** `Qwen/Qwen2.5-7B-Instruct` (7B params, 4GB VRAM/8GB RAM) - Speed: ⚡⚡⚡⚡⚡ Very Fast (2x faster) - Quality: ⭐⭐⭐ Good - Use case: Low-end hardware, quick responses **To upgrade LLM:** -```bash -# Pull new model -ollama pull qwen2.5:32b-instruct - +```env # Update .env -OLLAMA_MODEL=qwen2.5:32b-instruct +TRANSFORMERS_MODEL=Qwen/Qwen2.5-32B-Instruct ``` -No re-ingestion needed! +The new model will download automatically on first use. No re-ingestion needed! ### 🏆 **Recommended Production Configurations** @@ -376,7 +434,7 @@ No re-ingestion needed! 
```env EMBEDDING_MODEL=BAAI/bge-base-en-v1.5 RERANKER_MODEL=BAAI/bge-reranker-base -OLLAMA_MODEL=qwen2.5:14b-instruct +TRANSFORMERS_MODEL=Qwen/Qwen2.5-14B-Instruct ``` - **Speed:** Fast - **Quality:** Very Good @@ -387,7 +445,7 @@ OLLAMA_MODEL=qwen2.5:14b-instruct ```env EMBEDDING_MODEL=BAAI/bge-large-en-v1.5 RERANKER_MODEL=BAAI/bge-reranker-v2-m3 -OLLAMA_MODEL=qwen2.5:32b-instruct +TRANSFORMERS_MODEL=Qwen/Qwen2.5-32B-Instruct ``` - **Speed:** Moderate - **Quality:** Excellent @@ -398,7 +456,7 @@ OLLAMA_MODEL=qwen2.5:32b-instruct ```env EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2 RERANKER_MODEL=BAAI/bge-reranker-base -OLLAMA_MODEL=qwen2.5:14b-instruct +TRANSFORMERS_MODEL=Qwen/Qwen2.5-14B-Instruct ``` - **Speed:** Very Fast - **Quality:** Good @@ -409,7 +467,7 @@ OLLAMA_MODEL=qwen2.5:14b-instruct ```env EMBEDDING_MODEL=BAAI/bge-m3 RERANKER_MODEL=BAAI/bge-reranker-v2-m3 -OLLAMA_MODEL=qwen2.5:14b-instruct +TRANSFORMERS_MODEL=Qwen/Qwen2.5-14B-Instruct ``` - **Speed:** Moderate - **Quality:** Excellent @@ -489,18 +547,18 @@ RERANKER_MODEL=BAAI/bge-reranker-v2-m3 - The prompt automatically instructs it to respond in the question's language **Alternative Multilingual LLMs:** -```bash -ollama pull qwen2.5:32b-instruct # Best multilingual quality -ollama pull llama3.1:8b # Good for European languages -ollama pull mistral:7b-instruct # Good for French/English +```env +# In .env file +TRANSFORMERS_MODEL=Qwen/Qwen2.5-14B-Instruct # Excellent for 100+ languages +# TRANSFORMERS_MODEL=Qwen/Qwen2.5-32B-Instruct # Best multilingual quality +# Other alternatives: +# TRANSFORMERS_MODEL=meta-llama/Llama-3.1-8B-Instruct # Good for European languages ``` **Why it matters:** - Even with perfect retrieval, if LLM doesn't support the language, answers will be poor or in wrong language - Qwen models are already excellent for multilingual - upgrading mainly improves reasoning depth -**⚠️ Requires re-ingestion:** NO - just update `.env` and restart - --- ### Current System
Multilingual Capability @@ -509,7 +567,7 @@ ollama pull mistral:7b-instruct # Good for French/English |-----------|---------------|---------------|------------------------| | **Embedding** | all-MiniLM-L6-v2 | ❌ English-only | 🔴 **Poor retrieval** for non-English questions | | **Reranker** | bge-reranker-base | ⚠️ English-focused | 🟡 **Suboptimal ranking** for non-English | -| **LLM** | qwen2.5:14b-instruct | ✅ Excellent | ✅ **Perfect responses** in any language | +| **LLM** | Qwen2.5-14B-Instruct | ✅ Excellent | ✅ **Perfect responses** in any language | **Result:** The LLM **CAN respond** in French/Spanish/etc., but will work with **lower-quality context** retrieved by English-only embeddings. @@ -523,7 +581,7 @@ ollama pull mistral:7b-instruct # Good for French/English # In .env file EMBEDDING_MODEL=BAAI/bge-m3 RERANKER_MODEL=BAAI/bge-reranker-v2-m3 -OLLAMA_MODEL=qwen2.5:14b-instruct +TRANSFORMERS_MODEL=Qwen/Qwen2.5-14B-Instruct ``` **Steps:** @@ -662,3 +720,5 @@ CHUNK_OVERLAP=150 | 1500/300 | ~15,000 | Slower | Most Complete | **Rule of Thumb:** Overlap should be 10-20% of chunk size for optimal results. 
+ +--- diff --git a/frontend/app.py b/frontend/app.py index 3a23d60..b6a8523 100644 --- a/frontend/app.py +++ b/frontend/app.py @@ -64,8 +64,7 @@ class Handler(BaseHTTPRequestHandler): if not question: self._send(400, json.dumps({"error": "Question is required"})) return - provider = os.getenv("MODEL_PROVIDER", "ollama").lower() - answer, model_name, sources = run_query_complete(question, provider) + answer, model_name, sources = run_query_complete(question) self._send(200, json.dumps({"answer": answer, "model": model_name, "sources": sources})) except Exception as exc: import traceback diff --git a/rag/ingest.py b/rag/ingest.py index bf580b4..b6dcb43 100644 --- a/rag/ingest.py +++ b/rag/ingest.py @@ -8,7 +8,7 @@ from typing import List import re import xml.etree.ElementTree as ET -from langchain_community.document_loaders import ( # type: ignore +from langchain_community.document_loaders import ( DirectoryLoader, TextLoader, UnstructuredPowerPointLoader, @@ -19,28 +19,27 @@ from langchain_community.document_loaders import ( # type: ignore UnstructuredMarkdownLoader, UnstructuredWordDocumentLoader ) -from langchain_text_splitters import RecursiveCharacterTextSplitter # type: ignore -from langchain_community.embeddings import FastEmbedEmbeddings # type: ignore -from langchain_huggingface import HuggingFaceEmbeddings # type: ignore -from langchain_chroma import Chroma # type: ignore -from langchain_core.documents import Document # type: ignore +from langchain_text_splitters import RecursiveCharacterTextSplitter +from langchain_community.embeddings import FastEmbedEmbeddings +from langchain_huggingface import HuggingFaceEmbeddings +from langchain_chroma import Chroma +from langchain_core.documents import Document from concurrent.futures import ThreadPoolExecutor, as_completed import logging from datetime import datetime try: - from docx import Document as DocxDocument # type: ignore + from docx import Document as DocxDocument PYTHON_DOCX_AVAILABLE = True except 
ImportError: PYTHON_DOCX_AVAILABLE = False try: - from docx import Document as DocxDocument # type: ignore + from docx import Document as DocxDocument PYTHON_DOCX_AVAILABLE = True except ImportError: PYTHON_DOCX_AVAILABLE = False - def load_docx_with_python_docx(file_path: str) -> List[Document]: if not PYTHON_DOCX_AVAILABLE: raise ImportError("python-docx not available") @@ -50,7 +49,6 @@ def load_docx_with_python_docx(file_path: str) -> List[Document]: raise ValueError("No text extracted") return [Document(page_content=text, metadata={"source": Path(file_path).name})] - def load_docx_raw_xml(file_path: str) -> List[Document]: with zipfile.ZipFile(file_path, 'r') as docx_zip: try: @@ -68,7 +66,6 @@ def load_docx_raw_xml(file_path: str) -> List[Document]: except Exception as e: raise ValueError(f"Failed to extract from XML: {e}") - def extract_zip_files(docs_dir: Path) -> None: zip_files = list(docs_dir.glob("**/*.zip")) @@ -129,7 +126,6 @@ def extract_zip_files(docs_dir: Path) -> None: print() - def load_documents_batch(docs_dir: Path, batch_size: int = 50) -> tuple[List[Document], dict]: all_docs = [] stats = { @@ -265,7 +261,6 @@ def load_documents_batch(docs_dir: Path, batch_size: int = 50) -> tuple[List[Doc return all_docs, stats - def main() -> None: load_dotenv() @@ -334,16 +329,18 @@ def main() -> None: embedding_provider = os.getenv("EMBEDDING_PROVIDER", "fastembed").lower() embedding_model = os.getenv("EMBEDDING_MODEL") + embedding_device = os.getenv("EMBEDDING_DEVICE", "cuda").lower() print(f"\nSetting up embeddings...") print(f" Provider: {embedding_provider}") print(f" Model: {embedding_model}") + print(f" Device: {embedding_device}") if embedding_provider == "huggingface": print(" Using HuggingFace embeddings (supports any model)") embeddings = HuggingFaceEmbeddings( model_name=embedding_model, - model_kwargs={'device': 'cpu'}, + model_kwargs={'device': embedding_device}, encode_kwargs={'normalize_embeddings': True} ) else: @@ -406,6 +403,5 @@ def 
main() -> None: print(f"\n✅ Vector store is ready for querying") - if __name__ == "__main__": main() diff --git a/rag/query.py b/rag/query.py index 9002067..3d748ed 100644 --- a/rag/query.py +++ b/rag/query.py @@ -4,7 +4,10 @@ from pathlib import Path from typing import List from dotenv import load_dotenv -from langchain_ollama import ChatOllama # type: ignore +try: + from .transformers_llm import TransformersLLM # type: ignore +except Exception: + from transformers_llm import TransformersLLM # type: ignore from langchain_community.embeddings import FastEmbedEmbeddings # type: ignore from langchain_huggingface import HuggingFaceEmbeddings # type: ignore from langchain_chroma import Chroma # type: ignore @@ -12,6 +15,17 @@ from langchain_core.documents import Document # type: ignore from langchain_core.prompts import ChatPromptTemplate # type: ignore from sentence_transformers import CrossEncoder import time +import hashlib +import random + +try: + import numpy as _np +except Exception: + _np = None +try: + import torch as _torch +except Exception: + _torch = None class Reranker: @@ -57,9 +71,41 @@ def get_embeddings(): def get_llm(): - model_name = os.getenv("OLLAMA_MODEL") - base_url = os.getenv("OLLAMA_BASE_URL") - return ChatOllama(model=model_name, base_url=base_url), model_name + + load_dotenv() + + model_name = os.getenv("TRANSFORMERS_MODEL") + quantization = os.getenv("QUANTIZATION", "none") + device = os.getenv("LLM_DEVICE", "auto") + seed = int(os.getenv("LLM_SEED", "0")) + + if seed and seed > 0: + random.seed(seed) + if _np is not None: + _np.random.seed(seed) + if _torch is not None: + try: + _torch.manual_seed(seed) + _torch.cuda.manual_seed_all(seed) + _torch.use_deterministic_algorithms(True) + except Exception: + pass + + global _LLM_CACHE + try: + _LLM_CACHE + except NameError: + _LLM_CACHE = {} + + cache_key = (model_name, quantization, device) + if cache_key in _LLM_CACHE: + return _LLM_CACHE[cache_key], _LLM_CACHE[cache_key].model_name + + + llm 
= TransformersLLM() + + _LLM_CACHE[cache_key] = llm + return llm, llm.model_name @@ -176,15 +222,24 @@ Write your answer now (aim for 300-500+ words with extensive technical detail):" llm, model_name = get_llm() chain = prompt | llm - + context_text = format_docs(docs) + temp = os.getenv("TEMPERATURE", str(getattr(llm, "temperature", "unknown"))) + seed = os.getenv("LLM_SEED", "0") + sources = [d.metadata.get("source", "unknown") for d in docs] + prompt_payload = f"question:{query}\ncontext:{context_text}" + prompt_hash = hashlib.sha256(prompt_payload.encode("utf-8")).hexdigest() answer = "" for chunk in chain.stream({"question": query, "context": context_text}): - answer += chunk.content - - sources = [d.metadata.get('source', 'unknown') for d in docs] - + if hasattr(chunk, "content"): + answer += chunk.content + elif hasattr(chunk, "text"): + answer += chunk.text + else: + answer += str(chunk) + + sources = [d.metadata.get("source", "unknown") for d in docs] return answer, model_name, sources @@ -312,16 +367,26 @@ Write your answer now (aim for 300-500+ words with extensive technical detail):" llm, model_name = get_llm() chain = prompt | llm + context_text = format_docs(docs) + temp = os.getenv("TEMPERATURE", str(getattr(llm, "temperature", "unknown"))) + seed = os.getenv("LLM_SEED", "0") + sources = [d.metadata.get("source", "unknown") for d in docs] + prompt_payload = f"question:{query}\ncontext:{context_text}" + prompt_hash = hashlib.sha256(prompt_payload.encode("utf-8")).hexdigest() print(f"Querying model: {model_name}\n") - context_text = format_docs(docs) print("=== Answer ===\n") query_start = time.time() try: for chunk in chain.stream({"question": query, "context": context_text}): - print(chunk.content, end="", flush=True) + if hasattr(chunk, "content"): + print(chunk.content, end="", flush=True) + elif hasattr(chunk, "text"): + print(chunk.text, end="", flush=True) + else: + print(str(chunk), end="", flush=True) print(f"\n\n[Query completed in 
{time.time() - query_start:.2f}s]") except Exception as e: msg = str(e).lower() @@ -340,4 +405,4 @@ Write your answer now (aim for 300-500+ words with extensive technical detail):" if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/rag/transformers_llm.py b/rag/transformers_llm.py new file mode 100644 index 0000000..5b0b0c5 --- /dev/null +++ b/rag/transformers_llm.py @@ -0,0 +1,280 @@ +import os +os.environ.setdefault("TRANSFORMERS_VERBOSITY", os.getenv("TRANSFORMERS_VERBOSITY", "error")) + +from typing import Any, Iterator, Optional, List +import re +from threading import Thread + +import torch +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + BitsAndBytesConfig, + TextIteratorStreamer, + GenerationConfig, +) + +from langchain_core.language_models.llms import BaseLLM +from langchain_core.callbacks.manager import CallbackManagerForLLMRun +from langchain_core.outputs import GenerationChunk, LLMResult, Generation + + +class TransformersLLM(BaseLLM): + + model_name: str = "Qwen/Qwen2.5-14B-Instruct" + max_new_tokens: int = 2048 + temperature: float = 0.7 + model: Any = None + tokenizer: Any = None + + def __init__( + self, + model_name: str = "Qwen/Qwen2.5-14B-Instruct", + device: str = "auto", + dtype: str = "auto", + max_new_tokens: int = 2048, + temperature: float = 0.7, + quantization: str = "none", + **kwargs, + ): + env_model = os.getenv("TRANSFORMERS_MODEL") + env_quant = os.getenv("QUANTIZATION") + env_device = os.getenv("LLM_DEVICE") + env_max_tokens = os.getenv("MAX_NEW_TOKENS") + env_temp = os.getenv("TEMPERATURE") + + if env_model: + model_name = env_model + if env_quant: + quantization = env_quant + if env_device: + device = env_device + if env_max_tokens: + try: + max_new_tokens = int(env_max_tokens) + except Exception: + pass + if env_temp: + try: + temperature = float(env_temp) + except Exception: + pass + + super().__init__( + model_name=model_name, + max_new_tokens=max_new_tokens, + 
# NOTE: everything below is a method of the LLM wrapper class whose header and
# __init__ start before this chunk; each definition takes ``self``.

@property
def _llm_type(self) -> str:
    """Return the identifier string for this LLM backend."""
    return "transformers"

def _generate(
    self,
    prompts: List[str],
    stop: Optional[List[str]] = None,
    run_manager: Optional[CallbackManagerForLLMRun] = None,
    **kwargs: Any,
) -> LLMResult:
    """Generate one completion per prompt by delegating to ``_call``.

    Args:
        prompts: Prompt strings, processed sequentially.
        stop: Optional stop sequences forwarded to ``_call``.
        run_manager: Optional callback manager forwarded to ``_call``.
        **kwargs: Per-call generation overrides forwarded to ``_call``.

    Returns:
        An ``LLMResult`` holding a single ``Generation`` per prompt.
    """
    return LLMResult(
        generations=[
            [
                Generation(
                    text=self._call(
                        prompt, stop=stop, run_manager=run_manager, **kwargs
                    )
                )
            ]
            for prompt in prompts
        ]
    )

def _generation_guard(self):
    """Return the shared generation lock, or a no-op context if none is set."""
    from contextlib import nullcontext

    lock = getattr(self, "_generation_lock", None)
    return lock if lock is not None else nullcontext()

def _build_generation_kwargs(self, inputs: Any, kwargs: dict) -> dict:
    """Assemble the keyword arguments passed to ``self.model.generate``.

    Shared by ``_call`` and ``_stream`` so both paths sample identically.

    Args:
        inputs: Tokenizer output; unpacked into the generate kwargs.
        kwargs: Per-call overrides. ``temperature``, ``do_sample``, ``top_p``
            and ``top_k`` are read; everything else is ignored.

    Returns:
        A dict ready to be splatted into ``self.model.generate``.
    """
    temp = kwargs.get("temperature")
    # An explicit ``temperature=None`` falls back to the instance default
    # instead of crashing inside ``float()``.
    temp = float(self.temperature if temp is None else temp)
    do_sample = bool(kwargs.get("do_sample", temp > 0))
    top_p = kwargs.get("top_p", 0.9)
    top_k = kwargs.get("top_k", None)

    gen_kwargs = {
        **inputs,
        "max_new_tokens": self.max_new_tokens,
        "pad_token_id": self.tokenizer.eos_token_id,
        "eos_token_id": self.tokenizer.eos_token_id,
        "use_cache": True,
        "repetition_penalty": 1.1,
    }

    # Sampling-only flags are set solely when sampling is enabled.
    sampling = {}
    if do_sample:
        sampling["temperature"] = temp
        if top_p is not None:
            sampling["top_p"] = float(top_p)
        if top_k is not None:
            sampling["top_k"] = int(top_k)

    try:
        gen_cfg = GenerationConfig(do_sample=do_sample)
        for name, value in sampling.items():
            setattr(gen_cfg, name, value)
        gen_kwargs["generation_config"] = gen_cfg
    except Exception:
        # Best-effort fallback kept from the original code: if a
        # GenerationConfig cannot be built, pass the flags directly.
        gen_kwargs["do_sample"] = do_sample
        gen_kwargs.update(sampling)
    return gen_kwargs

@staticmethod
def _truncate_at_stop(text: str, stop: Optional[List[str]]) -> tuple:
    """Cut ``text`` at the earliest occurrence of any stop sequence.

    Args:
        text: Generated text to inspect.
        stop: Stop sequences; ``None``, an empty list and empty strings
            are all treated as "no stop".

    Returns:
        ``(truncated_text, hit)`` where ``hit`` is True when a stop
        sequence was found (the sequence itself is not included).
    """
    cut = len(text)
    for marker in stop or ():
        if not marker:
            continue
        idx = text.find(marker)
        if idx != -1:
            cut = min(cut, idx)
    return text[:cut], cut < len(text)

@staticmethod
def _strip_call_warnings(text: str) -> str:
    """Remove known warning fragments from a complete response and strip it."""
    if not text:
        return text
    patterns = [
        r"The following generation flags are not valid[\s\S]*?$",
        r"Set `TRANSFORMERS_VERBOSITY=info`[\s\S]*?$",
        r"transformers_verbosity",
    ]
    cleaned = text.replace("\r\n", "\n")
    for pattern in patterns:
        cleaned = re.sub(pattern, "", cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
    return cleaned.strip()

@staticmethod
def _strip_stream_warnings(text: str) -> str:
    """Remove known warning fragments from a partial streaming buffer."""
    patterns = [
        r"The following generation flags are not valid[\s\S]*?(?:$|\n)",
        r"may be ignored[\s\S]*?(?:$|\n)",
        r"Set `TRANSFORMERS_VERBOSITY=info`[\s\S]*?(?:$|\n)",
        r"transformers_verbosity",
        r"\[.*temperature.*\]",
    ]
    for pattern in patterns:
        text = re.sub(pattern, "", text, flags=re.IGNORECASE)
    return text

def _call(
    self,
    prompt: str,
    stop: Optional[list[str]] = None,
    run_manager: Optional[CallbackManagerForLLMRun] = None,
    **kwargs: Any,
) -> str:
    """Generate a completion for ``prompt`` and return it as cleaned text.

    Args:
        prompt: Text to complete.
        stop: Optional stop sequences; the response is cut at the first one
            that appears (previously this argument was ignored).
        run_manager: Accepted for interface compatibility; not used here.
        **kwargs: Optional ``temperature``, ``do_sample``, ``top_p`` and
            ``top_k`` overrides.

    Returns:
        The newly generated text only (prompt tokens are sliced off), with
        warning fragments removed and surrounding whitespace stripped.
    """
    inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
    gen_kwargs = self._build_generation_kwargs(inputs, kwargs)

    # The lock is released even if generate() raises.
    with torch.no_grad(), self._generation_guard():
        outputs = self.model.generate(**gen_kwargs)

    prompt_len = inputs["input_ids"].shape[1]
    response = self.tokenizer.decode(
        outputs[0][prompt_len:], skip_special_tokens=True
    )
    response, _ = self._truncate_at_stop(response, stop)
    return self._strip_call_warnings(response)

def _stream(
    self,
    prompt: str,
    stop: Optional[list[str]] = None,
    run_manager: Optional[CallbackManagerForLLMRun] = None,
    **kwargs: Any,
) -> Iterator[GenerationChunk]:
    """Stream a completion for ``prompt`` as newline-terminated chunks.

    Generation runs in a worker thread that feeds a
    ``TextIteratorStreamer``; this generator buffers the fragments and
    yields them one or more whole lines at a time.

    Args:
        prompt: Text to complete.
        stop: Optional stop sequences. Output ends at the first one found
            in the not-yet-emitted buffer. NOTE(review): a stop sequence
            that spans text already yielded in an earlier chunk is not
            detected.
        run_manager: Optional callback manager; notified of every chunk.
        **kwargs: Optional ``temperature``, ``do_sample``, ``top_p`` and
            ``top_k`` overrides.

    Yields:
        ``GenerationChunk`` objects whose text ends with a newline.

    Raises:
        Exception: Re-raises whatever ``self.model.generate`` raised in the
            worker thread, after the thread has finished.
    """
    inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
    streamer = TextIteratorStreamer(
        self.tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = self._build_generation_kwargs(inputs, kwargs)
    generation_kwargs["streamer"] = streamer

    guard = self._generation_guard()
    errors = []

    def _run_generation() -> None:
        # Hold the same lock as _call so streaming and blocking calls do
        # not run generate() concurrently on one model.
        try:
            with guard:
                self.model.generate(**generation_kwargs)
        except Exception as exc:
            errors.append(exc)
            # Without the end signal the consumer loop would block forever.
            streamer.end()

    thread = Thread(target=_run_generation)
    thread.start()

    def _emit(text: str) -> GenerationChunk:
        chunk = GenerationChunk(text=text)
        if run_manager:
            run_manager.on_llm_new_token(chunk.text, chunk=chunk)
        return chunk

    buffer = ""
    for fragment in streamer:
        buffer += fragment if isinstance(fragment, str) else str(fragment)
        buffer = self._strip_stream_warnings(buffer)
        buffer, stopped = self._truncate_at_stop(buffer, stop)
        if stopped:
            # Remaining text is flushed below; the worker keeps running
            # until generate() returns, and is joined before we finish.
            break
        if "\n" not in buffer:
            continue
        ready, rest = buffer.rsplit("\n", 1)
        ready = re.sub(r"\n{3,}", "\n\n", ready + "\n")
        if not ready.strip():
            # Whitespace only: keep it pending so blank lines between
            # paragraphs are emitted with the next line, not dropped.
            buffer = ready + rest
            continue
        buffer = rest
        yield _emit(ready)

    thread.join()
    if errors:
        raise errors[0]

    if buffer:
        buffer = self._strip_stream_warnings(buffer)
        buffer = re.sub(r"\n{3,}", "\n\n", buffer)
        if buffer.strip():
            if not buffer.endswith("\n"):
                buffer += "\n"
            yield _emit(buffer)