From c2a4798be995bcd2203cb070e47d5db9ca510001 Mon Sep 17 00:00:00 2001
From: Minh Hoang Anh TRAN <tranminhhoanganh2005@gmail.com>
Date: Tue, 24 Feb 2026 14:21:12 +0100
Subject: [PATCH 1/2] ajout du multilingue

---
 .env.example     | 214 ++++++++++++++++++++++++++++++++++++++++++-----
 README.md        | 139 ++++++++++++++++++++++++++++++
 rag/ingest.py    |  26 ++++--
 rag/query.py     |  51 ++++++++---
 requirements.txt |   1 +
 5 files changed, 393 insertions(+), 38 deletions(-)
diff --git a/.env.example b/.env.example
index 82a7165..88339bf 100644
--- a/.env.example
+++ b/.env.example
@@ -1,20 +1,50 @@
 ﻿# ============================================
 # MODEL PROVIDER CONFIGURATION
 # ============================================
-# Choose your LLM provider: ollama, mistral, or openai
 MODEL_PROVIDER=ollama
 
 # --------------------------------------------
 # OLLAMA SETTINGS (Local models: Qwen, Mistral, Llama, etc.)
 # --------------------------------------------
-# Current: Excellent balance of speed and quality (14B params, requires 16GB RAM)
+# CURRENT: Mid-end configuration (16GB RAM, mid-end GPU/CPU)
 OLLAMA_MODEL=qwen2.5:14b-instruct
 OLLAMA_BASE_URL=http://localhost:11434
 
-# Alternative LLM models (run 'ollama pull <model>' first):
-# OLLAMA_MODEL=qwen2.5:7b-instruct      # Faster: 7B params, requires 8GB RAM, 2x faster
-# OLLAMA_MODEL=qwen2.5:32b-instruct     # Better: 32B params, requires 32GB RAM, higher quality
-# OLLAMA_MODEL=qwen2.5:72b-instruct     # Best: 72B params, requires 64GB RAM, maximum quality
+# ============================================
+# HARDWARE-BASED MODEL RECOMMENDATIONS
+# ============================================
+# Choose the configuration that matches your hardware:
+
+# --- [1] LOW-END CONFIG (8GB RAM, low-end GPU, low-end CPU) ---
+# Best for: Budget laptops, older computers, testing
+# OLLAMA_MODEL=qwen2.5:3b-instruct
+# Alternative: qwen2.5:7b-instruct (if you can spare the RAM)
+# Expected speed: 5-10 seconds/query (CPU), 1-3 seconds (GPU)
+# RAM usage: ~4-6GB
+
+# --- [2] MID-END CONFIG (16GB RAM, mid-end GPU, mid-end CPU) --- ✅ CURRENT
+# Best for: Modern laptops, standard workstations
+# OLLAMA_MODEL=qwen2.5:14b-instruct
+# Alternative: qwen2.5:7b-instruct (faster)
+# Expected speed: 8-15 seconds/query (CPU), 2-4 seconds (GPU)
+# RAM usage: ~10-14GB
+
+# --- [3] BETTER MID-END CONFIG (16GB RAM, modern GPU, modern CPU) ---
+# Best for: Gaming PCs, modern workstations with RTX/RX GPUs
+# OLLAMA_MODEL=qwen2.5:14b-instruct
+# Alternative: qwen2.5:32b-instruct (if GPU has 12GB+ VRAM)
+# Expected speed: 1-2 seconds/query (good GPU), 6-10 seconds (CPU)
+# RAM usage: ~12-16GB
+# Note: Modern GPU makes huge difference, can handle 32b with quantization
+
+# --- [4] HIGH-END CONFIG (32GB+ RAM, high-end GPU, high-end CPU) ---
+# Best for: Workstations, servers, RTX 4090/A6000, ThreadRipper/Xeon
+# OLLAMA_MODEL=qwen2.5:32b-instruct
+# Alternatives:
+#   qwen2.5:72b-instruct    (64GB+ RAM)
+#   qwen2.5:110b-instruct   (80-128GB RAM, ultimate quality)
+# Expected speed: 0.5-1.5 seconds/query (high-end GPU), 15-30 seconds (CPU)
+# RAM usage: ~24-40GB (32b), ~50-80GB (72b), ~90-120GB (110b)
 
 # ============================================
 # DOCUMENT PROCESSING
@@ -55,17 +85,103 @@ CHUNK_OVERLAP=100       # 12.5% overlap prevents context loss at boundaries
 # ============================================
 # EMBEDDING MODEL CONFIGURATION
 # ============================================
-# FastEmbed provider (local, fast, no Ollama dependency)
-EMBEDDING_PROVIDER=fastembed
 
-# --- EMBEDDING MODEL OPTIONS ---
-# Current: FAST model for testing/development (384-dim, 5x faster)
-EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
+# --- EMBEDDING PROVIDER SELECTION ---
+# Choose: "fastembed" (faster, limited models) or "huggingface" (flexible, any model)
+EMBEDDING_PROVIDER=huggingface
+
+# CURRENT: Mid-end configuration (multilingual)
+EMBEDDING_MODEL=intfloat/multilingual-e5-large-instruct
+
+# ============================================
+# HOW TO SWITCH EMBEDDING PROVIDERS
+# ============================================
+# 
+# OPTION 1: FastEmbed (faster, optimized, limited model support)
+# EMBEDDING_PROVIDER=fastembed
+# EMBEDDING_MODEL=BAAI/bge-m3
+# 
+# OPTION 2: HuggingFace (flexible, supports any model, slightly slower)
+# EMBEDDING_PROVIDER=huggingface
+# EMBEDDING_MODEL=intfloat/multilingual-e5-large-instruct
+#
+# ⚠️ After changing provider or model: run `python rag/ingest.py`
+
+# ============================================
+# HARDWARE-BASED EMBEDDING MODEL RECOMMENDATIONS
+# ============================================
+# Choose the configuration that matches your hardware:
+
+# --- [1] LOW-END CONFIG (8GB RAM, low-end GPU, low-end CPU) ---
+# Best for: Budget systems, fast ingestion, English-only or primary language focus
+# EMBEDDING_PROVIDER=fastembed
+# EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
+# Specs: 384-dim, 22M params, 5x faster than bge-m3
+# Quality: Good for English, poor for other languages
+# Ingestion speed: ~500-800 docs/minute
+# RAM usage: ~500MB
+# Multilingual: ❌ English-only optimized
+# Provider: ✅ FastEmbed supported
+#
+# Multilingual alternative for low-end:
+# EMBEDDING_PROVIDER=fastembed  # or huggingface
+# EMBEDDING_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
+# Specs: 384-dim, slower than all-MiniLM but supports 50+ languages
+# Quality: Good for most languages
+# Ingestion speed: ~300-500 docs/minute
+# Provider: ✅ Both FastEmbed and HuggingFace
+
+# --- [2] MID-END CONFIG (16GB RAM, mid-end GPU, mid-end CPU) --- 
+# Best for: Multilingual document collections, good balance
+# EMBEDDING_PROVIDER=fastembed
+# EMBEDDING_MODEL=BAAI/bge-m3
+# Specs: 1024-dim, 568M params, excellent multilingual (100+ languages)
+# Quality: Excellent for all languages
+# Ingestion speed: ~150-250 docs/minute
+# RAM usage: ~2GB
+# Multilingual: ✅ Excellent (100+ languages)
+# Provider: ✅ FastEmbed supported
+#
+# English-only alternative for mid-end (faster):
+# EMBEDDING_PROVIDER=fastembed
+# EMBEDDING_MODEL=BAAI/bge-base-en-v1.5
+# Specs: 768-dim, 2x faster than bge-m3, English-only
+# Quality: Excellent for English
+# Ingestion speed: ~250-400 docs/minute
+# Provider: ✅ FastEmbed supported
+
+# --- [3] BETTER MID-END CONFIG (16GB RAM, modern GPU, modern CPU) --- ✅ CURRENT
+# Best for: High-quality multilingual RAG, modern systems
+# Option A: Use FastEmbed (faster)
+# EMBEDDING_PROVIDER=fastembed
+# EMBEDDING_MODEL=BAAI/bge-m3
+#
+# Option B: Use HuggingFace for better complex query understanding
+# EMBEDDING_PROVIDER=huggingface
+# EMBEDDING_MODEL=intfloat/multilingual-e5-large-instruct
+# Specs: 1024-dim, 560M params, instruction-tuned for RAG
+# Quality: Excellent+ (better understanding of complex queries)
+# Ingestion speed: ~120-200 docs/minute
+# RAM usage: ~2.5GB
+# Multilingual: ✅ Excellent (100+ languages)
+# Note: E5-large-instruct handles technical jargon and complex questions better
 
-# Production alternatives (uncomment to upgrade):
-# EMBEDDING_MODEL=BAAI/bge-base-en-v1.5      # Balanced: 768-dim, 2x faster than large
-# EMBEDDING_MODEL=BAAI/bge-large-en-v1.5     # Best Quality: 1024-dim, excellent retrieval
-# EMBEDDING_MODEL=BAAI/bge-m3                # Multilingual: 1024-dim, 100+ languages
+# --- [4] HIGH-END CONFIG (32GB+ RAM, high-end GPU, high-end CPU) ---
+# Best for: Maximum quality, production systems, critical applications
+# EMBEDDING_PROVIDER=huggingface
+# EMBEDDING_MODEL=intfloat/multilingual-e5-large-instruct
+# Specs: 1024-dim, 560M params, instruction-tuned, state-of-the-art RAG
+# Quality: Best available for open-source multilingual embeddings
+# Ingestion speed: ~100-180 docs/minute (with parallelization)
+# RAM usage: ~2.5-3GB
+# Multilingual: ✅ Excellent (100+ languages)
+# Best for: Complex technical documents, research papers, precise retrieval
+# Provider: ⚠️ HuggingFace only (not supported by FastEmbed)
+#
+# Alternative (if you need even larger context):
+# EMBEDDING_PROVIDER=fastembed
+# EMBEDDING_MODEL=BAAI/bge-large-en-v1.5
+# English-only but highest quality for English technical documents
 
 # ============================================
 # RETRIEVAL & RERANKING CONFIGURATION
@@ -74,10 +190,66 @@ RETRIEVAL_CHUNKS=100  # Initial chunks to retrieve (more = better recall, recomm
 TOP_N_RERANK=8        # Keep best N after reranking (recommended: 5-15)
 USE_RERANKING=true    # Enable reranking for better relevance (highly recommended)
 
-# --- RERANKER MODEL OPTIONS ---
-# Current: Good balance of speed and quality (278M params)
-RERANKER_MODEL=BAAI/bge-reranker-base
+# CURRENT: Mid-end configuration (multilingual)
+RERANKER_MODEL=BAAI/bge-reranker-v2-m3
 
-# Production alternatives (uncomment to upgrade):
-# RERANKER_MODEL=BAAI/bge-reranker-large     # Higher quality: 560M params, 2x slower
-# RERANKER_MODEL=BAAI/bge-reranker-v2-m3     # State-of-the-art: 568M params, multilingual
+# ============================================
+# HARDWARE-BASED RERANKER MODEL RECOMMENDATIONS
+# ============================================
+# Choose the configuration that matches your hardware:
+
+# --- [1] LOW-END CONFIG (8GB RAM, low-end GPU, low-end CPU) ---
+# Best for: Budget systems, fast queries
+# Option A: Disable reranking for maximum speed
+# USE_RERANKING=false
+# TOP_N_RERANK=8  # Not used when reranking disabled
+# Speed: Instant (no reranking overhead)
+#
+# Option B: Lightweight reranker (recommended if you can spare 2GB RAM)
+# RERANKER_MODEL=BAAI/bge-reranker-base
+# Specs: 278M params, English-focused but works for other languages
+# Quality: Good
+# Query overhead: +0.3-0.5 seconds
+# RAM usage: ~1-1.5GB
+# Multilingual: ⚠️ English-focused, acceptable for others
+
+# --- [2] MID-END CONFIG (16GB RAM, mid-end GPU, mid-end CPU) --- ✅ CURRENT
+# Best for: Multilingual systems, good quality/speed balance
+# RERANKER_MODEL=BAAI/bge-reranker-v2-m3
+# Specs: 568M params, state-of-the-art multilingual reranker
+# Quality: Excellent for all languages
+# Query overhead: +0.5-1 second
+# RAM usage: ~2-3GB
+# Multilingual: ✅ Excellent (100+ languages)
+# Top N: 8-10 recommended
+#
+# English-only alternative (slightly faster):
+# RERANKER_MODEL=BAAI/bge-reranker-large
+# Specs: 560M params, English-only, similar speed to v2-m3
+# Quality: Excellent for English
+
+# --- [3] BETTER MID-END CONFIG (16GB RAM, modern GPU, modern CPU) ---
+# Best for: Modern systems with good CPU, multilingual quality
+# RERANKER_MODEL=BAAI/bge-reranker-v2-m3
+# Specs: Same as mid-end, but modern CPU handles it faster
+# Quality: Excellent
+# Query overhead: +0.3-0.6 seconds (faster CPU)
+# RAM usage: ~2-3GB
+# Top N: 10-12 recommended (can afford more reranking)
+# Note: Good CPU makes reranking much faster, can increase TOP_N_RERANK
+
+# --- [4] HIGH-END CONFIG (32GB+ RAM, high-end GPU, high-end CPU) ---
+# Best for: Maximum accuracy, production systems
+# RERANKER_MODEL=BAAI/bge-reranker-v2-minicpm-layerwise
+# Specs: 2.4B params, state-of-the-art quality, multilingual
+# Quality: Best available (4x more parameters than v2-m3)
+# Query overhead: +1.5-3 seconds (3-5x slower than v2-m3)
+# RAM usage: ~8-16GB (just for reranker!)
+# Multilingual: ✅ Excellent (100+ languages)
+# Top N: 12-15 recommended
+# Best for: Critical applications where accuracy > speed
+#
+# Alternative (balanced high-end):
+# RERANKER_MODEL=BAAI/bge-reranker-v2-m3
+# Use current model but with higher TOP_N_RERANK=15
+# Faster queries while maintaining excellent quality
diff --git a/README.md b/README.md
index 388eb82..830b1d9 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,11 @@ A Retrieval-Augmented Generation (RAG) system that lets you query your documents
   - [Upgrading to Production Quality](#upgrading-to-production-quality)
   - [🏆 Recommended Production Configurations](#-recommended-production-configurations)
   - [⚡ Performance Impact Summary](#-performance-impact-summary)
+- [🌍 Multilingual Functionality Guide](#-multilingual-functionality-guide)
+  - [How Each Component Affects Multilingual Support](#how-each-component-affects-multilingual-support)
+  - [Current System Multilingual Capability](#current-system-multilingual-capability)
+  - [Upgrading to Full Multilingual Support](#upgrading-to-full-multilingual-support)
+  - [Testing Multilingual Functionality](#testing-multilingual-functionality)
 - [Chunking Configuration Guide](#chunking-configuration-guide)
   - [What is Chunking?](#what-is-chunking)
   - [Current Default Settings](#current-default-settings)
@@ -423,6 +428,140 @@ OLLAMA_MODEL=qwen2.5:14b-instruct
 
 ---
 
+## 🌍 Multilingual Functionality Guide
+
+The chatbot **automatically responds in the language you use** to ask questions (English, French, Spanish, etc.). However, **each model component affects multilingual quality differently**:
+
+### How Each Component Affects Multilingual Support
+
+#### **1. Embedding Model - CRITICAL for Multilingual Retrieval** 🔴
+
+**Impact:** Determines if your question in ANY language can find relevant documents
+
+**Current Model:** `sentence-transformers/all-MiniLM-L6-v2`
+- ⚠️ **English-only optimized**
+- Non-English queries will retrieve less relevant documents
+- Works for English, poor for French/Spanish/other languages
+
+**Recommended for Multilingual:**
+```env
+EMBEDDING_MODEL=BAAI/bge-m3
+# or
+EMBEDDING_MODEL=sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
+```
+
+**Why it matters:**
+- French question → English-focused embeddings → retrieves wrong documents → LLM gets irrelevant context → poor answer **even if LLM speaks French**
+- Multilingual embeddings → retrieves correct documents in any language → LLM gets relevant context → excellent answer
+
+**⚠️ Requires re-ingestion:** YES - `python rag/ingest.py`
+
+---
+
+#### **2. Reranker Model - Important for Multilingual Precision** 🟡
+
+**Impact:** Refines which documents are most relevant to your question
+
+**Current Model:** `BAAI/bge-reranker-base`
+- ⚠️ **English-focused**
+- Can rerank, but less accurate for non-English queries
+
+**Recommended for Multilingual:**
+```env
+RERANKER_MODEL=BAAI/bge-reranker-v2-m3
+```
+
+**Why it matters:**
+- Even if embeddings retrieve 10 good multilingual documents, English-only reranker might rank them poorly
+- Multilingual reranker correctly identifies the most relevant chunks in any language
+
+**⚠️ Requires re-ingestion:** NO - just update `.env` and restart
+
+---
+
+#### **3. LLM (Text Generation Model) - Determines Answer Language** 🟢
+
+**Impact:** Generates the actual response in the target language
+
+**Current Model:** `qwen2.5:14b-instruct`
+- ✅ **Excellent multilingual support** (100+ languages)
+- Strong in: English, Chinese, French, Spanish, German, Japanese, Korean, Arabic, and more
+- The prompt automatically instructs it to respond in the question's language
+
+**Alternative Multilingual LLMs:**
+```bash
+ollama pull qwen2.5:32b-instruct    # Best multilingual quality
+ollama pull llama3.1:8b             # Good for European languages
+ollama pull mistral:7b-instruct     # Good for French/English
+```
+
+**Why it matters:**
+- Even with perfect retrieval, if LLM doesn't support the language, answers will be poor or in wrong language
+- Qwen models are already excellent for multilingual - upgrading mainly improves reasoning depth
+
+**⚠️ Requires re-ingestion:** NO - just update `.env` and restart
+
+---
+
+### Current System Multilingual Capability
+
+| Component | Current Model | Multilingual? | Impact on Non-English |
+|-----------|---------------|---------------|------------------------|
+| **Embedding** | all-MiniLM-L6-v2 | ❌ English-only | 🔴 **Poor retrieval** for non-English questions |
+| **Reranker** | bge-reranker-base | ⚠️ English-focused | 🟡 **Suboptimal ranking** for non-English |
+| **LLM** | qwen2.5:14b-instruct | ✅ Excellent | ✅ **Perfect responses** in any language |
+
+**Result:** The LLM **CAN respond** in French/Spanish/etc., but will work with **lower-quality context** retrieved by English-only embeddings.
+
+---
+
+### Upgrading to Full Multilingual Support
+
+**Recommended Configuration:**
+
+```env
+# In .env file
+EMBEDDING_MODEL=BAAI/bge-m3
+RERANKER_MODEL=BAAI/bge-reranker-v2-m3
+OLLAMA_MODEL=qwen2.5:14b-instruct
+```
+
+**Steps:**
+1. Update `.env` with multilingual models
+2. Re-ingest documents: `python rag/ingest.py` (required for embedding change)
+3. Restart frontend/queries
+
+**Benefits:**
+- ✅ Excellent retrieval for questions in **any language**
+- ✅ Accurate reranking regardless of language
+- ✅ High-quality answers in **100+ languages**
+
+**Trade-offs:**
+- Slightly slower (BGE-m3 is ~2x slower than all-MiniLM-L6-v2)
+- Larger model downloads (~3GB vs 90MB)
+
+---
+
+### Testing Multilingual Functionality
+
+```powershell
+# English
+python rag/query.py "What are the latest V-PCC compression results?"
+
+# French
+python rag/query.py "Quels sont les derniers résultats de compression V-PCC ?"
+
+# Spanish
+python rag/query.py "¿Cuáles son los últimos resultados de compresión V-PCC?"
+```
+
+**Expected behavior:**
+- ✅ LLM responds in the correct language (works with current setup)
+- ⚠️ Answer quality may be lower for non-English with current English-only embeddings
+- ✅ Full quality in all languages after upgrading to multilingual embeddings
+
+---
+
 ## Chunking Configuration Guide
 
 ### What is Chunking?
diff --git a/rag/ingest.py b/rag/ingest.py
index 05d9a6e..bf580b4 100644
--- a/rag/ingest.py
+++ b/rag/ingest.py
@@ -21,6 +21,7 @@ from langchain_community.document_loaders import (  # type: ignore
 )
 from langchain_text_splitters import RecursiveCharacterTextSplitter  # type: ignore
 from langchain_community.embeddings import FastEmbedEmbeddings  # type: ignore
+from langchain_huggingface import HuggingFaceEmbeddings  # type: ignore
 from langchain_chroma import Chroma  # type: ignore
 from langchain_core.documents import Document  # type: ignore
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -331,14 +332,27 @@ def main() -> None:
     chunks = splitter.split_documents(all_docs)
     print(f"  Created {len(chunks)} chunks in {time.time() - start_split:.2f}s")
 
+    embedding_provider = os.getenv("EMBEDDING_PROVIDER", "fastembed").lower()
     embedding_model = os.getenv("EMBEDDING_MODEL")
-    print(f"\nUsing FastEmbed embeddings: {embedding_model}")
     
-    embeddings = FastEmbedEmbeddings(
-        model_name=embedding_model, 
-        max_length=512,
-        threads=4
-    )
+    print(f"\nSetting up embeddings...")
+    print(f"  Provider: {embedding_provider}")
+    print(f"  Model: {embedding_model}")
+    
+    if embedding_provider == "huggingface":
+        print("  Using HuggingFace embeddings (supports any model)")
+        embeddings = HuggingFaceEmbeddings(
+            model_name=embedding_model,
+            model_kwargs={'device': 'cpu'},  
+            encode_kwargs={'normalize_embeddings': True}
+        )
+    else:  
+        print("  Using FastEmbed embeddings (optimized, limited models)")
+        embeddings = FastEmbedEmbeddings(
+            model_name=embedding_model, 
+            max_length=512,
+            threads=4
+        )
 
     chroma_dir.mkdir(parents=True, exist_ok=True)
     print(f"\nBuilding Chroma index at: {chroma_dir}")
diff --git a/rag/query.py b/rag/query.py
index dcef13e..9002067 100644
--- a/rag/query.py
+++ b/rag/query.py
@@ -6,6 +6,7 @@ from typing import List
 from dotenv import load_dotenv
 from langchain_ollama import ChatOllama  # type: ignore
 from langchain_community.embeddings import FastEmbedEmbeddings  # type: ignore
+from langchain_huggingface import HuggingFaceEmbeddings  # type: ignore
 from langchain_chroma import Chroma  # type: ignore
 from langchain_core.documents import Document  # type: ignore
 from langchain_core.prompts import ChatPromptTemplate  # type: ignore
@@ -37,12 +38,31 @@ def format_docs(docs: List[Document]) -> str:
     return "\n\n".join(f"[Source: {d.metadata.get('source', 'unknown')}]\n{d.page_content}" for d in docs)
 
 
+def get_embeddings():
+    """Get embeddings based on the configured provider."""
+    embedding_provider = os.getenv("EMBEDDING_PROVIDER", "fastembed").lower()
+    embedding_model = os.getenv("EMBEDDING_MODEL")
+    
+    if embedding_provider == "huggingface":
+        return HuggingFaceEmbeddings(
+            model_name=embedding_model,
+            model_kwargs={'device': 'cpu'},
+            encode_kwargs={'normalize_embeddings': True}
+        )
+    else: 
+        return FastEmbedEmbeddings(
+            model_name=embedding_model,
+            max_length=512
+        )
+
+
 def get_llm():
     model_name = os.getenv("OLLAMA_MODEL")
     base_url = os.getenv("OLLAMA_BASE_URL")
     return ChatOllama(model=model_name, base_url=base_url), model_name
 
 
+
 def run_query_complete(query: str, provider: str = "ollama") -> tuple[str, str, list[str]]:
     """
     Run a complete query and return (answer, model_name, sources).
@@ -55,11 +75,7 @@ def run_query_complete(query: str, provider: str = "ollama") -> tuple[str, str,
     if not chroma_dir.exists():
         raise FileNotFoundError(f"Vector store directory not found: {chroma_dir}. Run ingestion first.")
     
-    embedding_model = os.getenv("EMBEDDING_MODEL")
-    embeddings = FastEmbedEmbeddings(
-        model_name=embedding_model,
-        max_length=512
-    )
+    embeddings = get_embeddings()
     
     vectorstore = Chroma(persist_directory=str(chroma_dir), embedding_function=embeddings)
     
@@ -75,10 +91,18 @@ def run_query_complete(query: str, provider: str = "ollama") -> tuple[str, str,
         compressor = Reranker(model_name=reranker_model, top_n=top_n)
         docs = compressor.compress_documents(docs, query)
 
-    # Use the EXACT same prompt as the terminal version for consistent quality
     prompt = ChatPromptTemplate.from_messages([
         ("system", """You are an expert technical assistant that provides extremely detailed, comprehensive, and in-depth answers based on the given context.
 
+🌍 LANGUAGE REQUIREMENT - CRITICAL:
+- ALWAYS respond in the SAME LANGUAGE as the question
+- If the question is in French, respond in French
+- If the question is in English, respond in English
+- If the question is in Spanish, respond in Spanish
+- Apply this to ANY language the user asks in
+- Maintain the SAME level of technical detail and quality regardless of language
+- Technical terms can remain in English if there's no standard translation, but explain them in the question's language
+
 CRITICAL INSTRUCTIONS - YOUR ANSWERS MUST BE DETAILED AND THOROUGH:
 
 📝 LENGTH & DEPTH REQUIREMENTS:
@@ -182,11 +206,7 @@ def main() -> None:
     print(f"Loading Chroma from: {chroma_dir}")
     
     start_time = time.time()
-    embedding_model = os.getenv("EMBEDDING_MODEL")
-    embeddings = FastEmbedEmbeddings(
-        model_name=embedding_model,
-        max_length=512
-    )
+    embeddings = get_embeddings()
     
     vectorstore = Chroma(persist_directory=str(chroma_dir), embedding_function=embeddings)
     print(f"Loaded in {time.time() - start_time:.2f}s")
@@ -210,6 +230,15 @@ def main() -> None:
     prompt = ChatPromptTemplate.from_messages([
         ("system", """You are an expert technical assistant that provides extremely detailed, comprehensive, and in-depth answers based on the given context.
 
+🌍 LANGUAGE REQUIREMENT - CRITICAL:
+- ALWAYS respond in the SAME LANGUAGE as the question
+- If the question is in French, respond in French
+- If the question is in English, respond in English
+- If the question is in Spanish, respond in Spanish
+- Apply this to ANY language the user asks in
+- Maintain the SAME level of technical detail and quality regardless of language
+- Technical terms can remain in English if there's no standard translation, but explain them in the question's language
+
 CRITICAL INSTRUCTIONS - YOUR ANSWERS MUST BE DETAILED AND THOROUGH:
 
 📝 LENGTH & DEPTH REQUIREMENTS:
diff --git a/requirements.txt b/requirements.txt
index 146b9fa..1a696bd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@ langchain>=1.0.8
 langchain-community>=0.4.1
 langchain-ollama>=1.0.0
 langchain-chroma>=0.1.2
+langchain-huggingface>=0.1.0
 chromadb>=1.3.5
 fastembed>=0.7.3
 python-dotenv>=1.2.1
-- 
GitLab


From 1b9d6894fe07a56c16af78ecfb3074cef590a37c Mon Sep 17 00:00:00 2001
From: Nozomu05 <tranminhhoanganh2005@gmail.com>
Date: Wed, 25 Feb 2026 17:35:15 +0100
Subject: [PATCH 2/2] ajout du multilingue avec l'ajout de QWEN sans
 utilisation de OLLAMA

---
 .env.example            | 183 ++++++++++++++++++-------
 .gitignore              |  10 +-
 README.md               | 292 ++++++++++++++++++++++++----------------
 frontend/app.py         |   3 +-
 rag/ingest.py           |  26 ++--
 rag/query.py            |  89 ++++++++++--
 rag/transformers_llm.py | 280 ++++++++++++++++++++++++++++++++++++++
 requirements.txt        |   7 +-
 8 files changed, 695 insertions(+), 195 deletions(-)
 create mode 100644 rag/transformers_llm.py

diff --git a/.env.example b/.env.example
index 88339bf..a712526 100644
--- a/.env.example
+++ b/.env.example
@@ -1,14 +1,86 @@
 ﻿# ============================================
 # MODEL PROVIDER CONFIGURATION
 # ============================================
-MODEL_PROVIDER=ollama
+LLM_PROVIDER=transformers
 
 # --------------------------------------------
-# OLLAMA SETTINGS (Local models: Qwen, Mistral, Llama, etc.)
+# TRANSFORMERS SETTINGS (HuggingFace models)
 # --------------------------------------------
-# CURRENT: Mid-end configuration (16GB RAM, mid-end GPU/CPU)
-OLLAMA_MODEL=qwen2.5:14b-instruct
-OLLAMA_BASE_URL=http://localhost:11434
+TRANSFORMERS_MODEL=Qwen/Qwen2.5-7B-Instruct
+MAX_NEW_TOKENS=2048
+TEMPERATURE=0.7
+# Model will download to ~/.cache/huggingface/ (about 28GB for 14B model)
+# QUANTIZATION: Reduces memory usage (important for GPUs with limited VRAM)
+#   4bit - Use 4-bit quantization (~3.5GB VRAM for 14B model, recommended for GPUs with <12GB VRAM)
+#   8bit - Use 8-bit quantization (~7GB VRAM for 14B model)
+#   none - No quantization (~14GB VRAM for 14B model, requires high-end GPU)
+QUANTIZATION=4bit
+# MAX_NEW_TOKENS: Increase for longer, more complete answers. 2048 allows detailed responses.
+
+# --------------------------------------------
+# DEVICE CONFIGURATION
+# --------------------------------------------
+# Control which device each component uses
+# Options:
+#   auto  - Auto-detect GPU and use it if available (recommended for LLM)
+#   cuda  - Force GPU usage (fastest, requires NVIDIA GPU with CUDA)
+#   cpu   - Force CPU usage (slower but works on any computer)
+#
+# Components:
+LLM_DEVICE=auto              # Qwen language model (main inference)
+EMBEDDING_DEVICE=cuda        # Document/query embeddings (10-50x faster on GPU)
+RERANKER_DEVICE=cuda         # Re-ranking model (improves result quality)
+#
+# Recommendations:
+#   - With GPU: Use cuda/auto for all (best performance)
+#   - CPU only: Set all to cpu
+#   - Limited GPU memory: LLM=cpu, EMBEDDING=cuda, RERANKER=cpu
+
+# ============================================
+# HARDWARE-BASED MODEL RECOMMENDATIONS
+# ============================================
+# Choose the configuration that matches your hardware:
+
+# Copy of .env for reference and quick setup; edit as needed.
+
+# ============================================
+# MODEL PROVIDER CONFIGURATION
+# ============================================
+LLM_PROVIDER=transformers
+
+# --------------------------------------------
+# TRANSFORMERS SETTINGS (HuggingFace models)
+# --------------------------------------------
+TRANSFORMERS_MODEL=Qwen/Qwen2.5-14B-Instruct
+MAX_NEW_TOKENS=4096
+TEMPERATURE=0
+LLM_SEED=42
+# Model will download to ~/.cache/huggingface/ (about 28GB for 14B model)
+# QUANTIZATION: Reduces memory usage (important for GPUs with limited VRAM)
+#   4bit - Use 4-bit quantization (~3.5GB VRAM for 14B model, recommended for GPUs with <12GB VRAM)
+#   8bit - Use 8-bit quantization (~7GB VRAM for 14B model)
+#   none - No quantization (~14GB VRAM for 14B model, requires high-end GPU)
+QUANTIZATION=4bit
+# MAX_NEW_TOKENS: Increase for longer, more complete answers. 4096 allows longer responses.
+
+# --------------------------------------------
+# DEVICE CONFIGURATION
+# --------------------------------------------
+# Control which device each component uses
+# Options:
+#   auto  - Auto-detect GPU and use it if available (recommended for LLM)
+#   cuda  - Force GPU usage (fastest, requires NVIDIA GPU with CUDA)
+#   cpu   - Force CPU usage (slower but works on any computer)
+#
+# Components:
+LLM_DEVICE=auto              # Qwen language model (main inference)
+EMBEDDING_DEVICE=cuda        # Document/query embeddings (10-50x faster on GPU)
+RERANKER_DEVICE=cuda         # Re-ranking model (improves result quality)
+#
+# Recommendations:
+#   - With GPU: Use cuda/auto for all (best performance)
+#   - CPU only: Set all to cpu
+#   - Limited GPU memory: LLM=cpu, EMBEDDING=cuda, RERANKER=cpu
 
 # ============================================
 # HARDWARE-BASED MODEL RECOMMENDATIONS
@@ -17,34 +89,33 @@ OLLAMA_BASE_URL=http://localhost:11434
 
 # --- [1] LOW-END CONFIG (8GB RAM, low-end GPU, low-end CPU) ---
 # Best for: Budget laptops, older computers, testing
-# OLLAMA_MODEL=qwen2.5:3b-instruct
-# Alternative: qwen2.5:7b-instruct (if you can spare the RAM)
+# TRANSFORMERS_MODEL=Qwen/Qwen2.5-3B-Instruct
+# Alternative: Qwen/Qwen2.5-7B-Instruct (if you can spare the RAM)
 # Expected speed: 5-10 seconds/query (CPU), 1-3 seconds (GPU)
 # RAM usage: ~4-6GB
 
 # --- [2] MID-END CONFIG (16GB RAM, mid-end GPU, mid-end CPU) --- ✅ CURRENT
 # Best for: Modern laptops, standard workstations
-# OLLAMA_MODEL=qwen2.5:14b-instruct
-# Alternative: qwen2.5:7b-instruct (faster)
+# TRANSFORMERS_MODEL=Qwen/Qwen2.5-14B-Instruct
+# Alternative: Qwen/Qwen2.5-7B-Instruct (faster)
 # Expected speed: 8-15 seconds/query (CPU), 2-4 seconds (GPU)
 # RAM usage: ~10-14GB
 
 # --- [3] BETTER MID-END CONFIG (16GB RAM, modern GPU, modern CPU) ---
 # Best for: Gaming PCs, modern workstations with RTX/RX GPUs
-# OLLAMA_MODEL=qwen2.5:14b-instruct
-# Alternative: qwen2.5:32b-instruct (if GPU has 12GB+ VRAM)
+# TRANSFORMERS_MODEL=Qwen/Qwen2.5-14B-Instruct
+# Alternative: Qwen/Qwen2.5-32B-Instruct (if GPU has 12GB+ VRAM)
 # Expected speed: 1-2 seconds/query (good GPU), 6-10 seconds (CPU)
 # RAM usage: ~12-16GB
-# Note: Modern GPU makes huge difference, can handle 32b with quantization
+# Note: Modern GPU makes huge difference, can handle 32B models
 
 # --- [4] HIGH-END CONFIG (32GB+ RAM, high-end GPU, high-end CPU) ---
 # Best for: Workstations, servers, RTX 4090/A6000, ThreadRipper/Xeon
-# OLLAMA_MODEL=qwen2.5:32b-instruct
+# TRANSFORMERS_MODEL=Qwen/Qwen2.5-32B-Instruct
 # Alternatives:
-#   qwen2.5:72b-instruct    (64GB+ RAM)
-#   qwen2.5:110b-instruct   (80-128GB RAM, ultimate quality)
+#   Qwen/Qwen2.5-72B-Instruct    (64GB+ RAM)
 # Expected speed: 0.5-1.5 seconds/query (high-end GPU), 15-30 seconds (CPU)
-# RAM usage: ~24-40GB (32b), ~50-80GB (72b), ~90-120GB (110b)
+# RAM usage: ~24-40GB (32B), ~50-80GB (72B)
 
 # ============================================
 # DOCUMENT PROCESSING
@@ -93,6 +164,50 @@ EMBEDDING_PROVIDER=huggingface
 # CURRENT: Mid-end configuration (multilingual)
 EMBEDDING_MODEL=intfloat/multilingual-e5-large-instruct
 
+# ============================================
+# HOW TO SWITCH EMBEDDING PROVIDERS
+# ============================================
+# 
+# OPTION 1: FastEmbed (faster, optimized, limited model support)
+*** End Patch
+# --- CHUNKING CONFIGURATION ---
+# How documents are split affects answer quality!
+# Chunk size = characters per chunk | Overlap = shared chars between adjacent chunks
+
+# CURRENT: General technical documents (good balance)
+CHUNK_SIZE=800          # ~150-200 words, 2-3 paragraphs
+CHUNK_OVERLAP=100       # 12.5% overlap prevents context loss at boundaries
+
+# ADJUST BASED ON YOUR DOCUMENT TYPE:
+# Dense technical specs/standards (MPEG, ISO, etc.):
+#   CHUNK_SIZE=1000-1200, CHUNK_OVERLAP=150-200
+#   → Preserves complete technical descriptions, tables, multi-paragraph explanations
+#
+# Short Q&A, FAQs, snippets:
+#   CHUNK_SIZE=500-600, CHUNK_OVERLAP=75-100
+#   → More precision, faster retrieval, good for focused answers
+#
+# Long-form articles, research papers:
+#   CHUNK_SIZE=1500-2000, CHUNK_OVERLAP=300-400
+#   → Preserves narrative flow, complete arguments, methodology sections
+#
+# Mixed document collection:
+#   CHUNK_SIZE=800-1000, CHUNK_OVERLAP=120-150
+#   → Balanced for various content types
+#
+# NOTE: Changing these requires re-ingestion: python rag/ingest.py
+
+# ============================================
+# EMBEDDING MODEL CONFIGURATION
+# ============================================
+
+# --- EMBEDDING PROVIDER SELECTION ---
+# Choose: "fastembed" (faster, limited models) or "huggingface" (flexible, any model)
+EMBEDDING_PROVIDER=huggingface
+
+# CURRENT: Mid-end configuration (multilingual)
+EMBEDDING_MODEL=intfloat/multilingual-e5-large-instruct
+
 # ============================================
 # HOW TO SWITCH EMBEDDING PROVIDERS
 # ============================================
@@ -130,7 +245,7 @@ EMBEDDING_MODEL=intfloat/multilingual-e5-large-instruct
 # Quality: Good for most languages
 # Ingestion speed: ~300-500 docs/minute
 # Provider: ✅ Both FastEmbed and HuggingFace
-
+#
 # --- [2] MID-END CONFIG (16GB RAM, mid-end GPU, mid-end CPU) --- 
 # Best for: Multilingual document collections, good balance
 # EMBEDDING_PROVIDER=fastembed
@@ -149,7 +264,7 @@ EMBEDDING_MODEL=intfloat/multilingual-e5-large-instruct
 # Quality: Excellent for English
 # Ingestion speed: ~250-400 docs/minute
 # Provider: ✅ FastEmbed supported
-
+#
 # --- [3] BETTER MID-END CONFIG (16GB RAM, modern GPU, modern CPU) --- ✅ CURRENT
 # Best for: High-quality multilingual RAG, modern systems
 # Option A: Use FastEmbed (faster)
@@ -165,7 +280,7 @@ EMBEDDING_MODEL=intfloat/multilingual-e5-large-instruct
 # RAM usage: ~2.5GB
 # Multilingual: ✅ Excellent (100+ languages)
 # Note: E5-large-instruct handles technical jargon and complex questions better
-
+#
 # --- [4] HIGH-END CONFIG (32GB+ RAM, high-end GPU, high-end CPU) ---
 # Best for: Maximum quality, production systems, critical applications
 # EMBEDDING_PROVIDER=huggingface
@@ -182,7 +297,7 @@ EMBEDDING_MODEL=intfloat/multilingual-e5-large-instruct
 # EMBEDDING_PROVIDER=fastembed
 # EMBEDDING_MODEL=BAAI/bge-large-en-v1.5
 # English-only but highest quality for English technical documents
-
+#
 # ============================================
 # RETRIEVAL & RERANKING CONFIGURATION
 # ============================================
@@ -212,7 +327,7 @@ RERANKER_MODEL=BAAI/bge-reranker-v2-m3
 # Query overhead: +0.3-0.5 seconds
 # RAM usage: ~1-1.5GB
 # Multilingual: ⚠️ English-focused, acceptable for others
-
+#
 # --- [2] MID-END CONFIG (16GB RAM, mid-end GPU, mid-end CPU) --- ✅ CURRENT
 # Best for: Multilingual systems, good quality/speed balance
 # RERANKER_MODEL=BAAI/bge-reranker-v2-m3
@@ -227,29 +342,5 @@ RERANKER_MODEL=BAAI/bge-reranker-v2-m3
 # RERANKER_MODEL=BAAI/bge-reranker-large
 # Specs: 560M params, English-only, similar speed to v2-m3
 # Quality: Excellent for English
-
-# --- [3] BETTER MID-END CONFIG (16GB RAM, modern GPU, modern CPU) ---
-# Best for: Modern systems with good CPU, multilingual quality
-# RERANKER_MODEL=BAAI/bge-reranker-v2-m3
-# Specs: Same as mid-end, but modern CPU handles it faster
-# Quality: Excellent
-# Query overhead: +0.3-0.6 seconds (faster CPU)
-# RAM usage: ~2-3GB
-# Top N: 10-12 recommended (can afford more reranking)
-# Note: Good CPU makes reranking much faster, can increase TOP_N_RERANK
-
-# --- [4] HIGH-END CONFIG (32GB+ RAM, high-end GPU, high-end CPU) ---
-# Best for: Maximum accuracy, production systems
-# RERANKER_MODEL=BAAI/bge-reranker-v2-minicpm-layerwise
-# Specs: 2.4B params, state-of-the-art quality, multilingual
-# Quality: Best available (4x more parameters than v2-m3)
-# Query overhead: +1.5-3 seconds (3-5x slower than v2-m3)
-# RAM usage: ~8-16GB (just for reranker!)
-# Multilingual: ✅ Excellent (100+ languages)
-# Top N: 12-15 recommended
-# Best for: Critical applications where accuracy > speed
 #
-# Alternative (balanced high-end):
-# RERANKER_MODEL=BAAI/bge-reranker-v2-m3
-# Use current model but with higher TOP_N_RERANK=15
-# Faster queries while maintaining excellent quality
+```
diff --git a/.gitignore b/.gitignore
index e337265..b077bd0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
 # Python
 .venv/
 venv/
+QWEN/
 __pycache__/
 *.py[cod]
 *$py.class
@@ -39,13 +40,18 @@ wheels/
 Thumbs.db
 
 # Project specific
-storage/chroma/
-docs/
+ 
 storage/.ingest_cache.json
 ingestion_errors.log
 
 # Office temporary files
 ~$*
+# Ignore everything inside docs/ but not the folder itself
+docs/*
+!docs/
+# Ignore everything inside storage/ but not the folder itself
+storage/*
+!storage/
 *.tmp
 
 
diff --git a/README.md b/README.md
index 830b1d9..3837d1b 100644
--- a/README.md
+++ b/README.md
@@ -1,22 +1,22 @@
 # RAG System with Qwen
 
-A Retrieval-Augmented Generation (RAG) system that lets you query your documents using Ollama and Qwen models locally.
+A Retrieval-Augmented Generation (RAG) system that lets you query your documents using Qwen models from HuggingFace Transformers locally.
 
 ---
 
 ## Table of Contents
 
 - [Installation and Setup](#installation-and-setup)
-  - [Step 1: Install Ollama](#step-1-install-ollama)
-  - [Step 2: Pull Required Ollama Models](#step-2-pull-required-ollama-models)
-  - [Step 3: Enable CPU-Only Mode (For Low-End Computers)](#step-3-enable-cpu-only-mode-for-low-end-computers)
-  - [Step 4: Install Pandoc (Optional)](#step-4-install-pandoc-optional)
-  - [Step 5: Setup Python Environment](#step-5-setup-python-environment)
-  - [Step 6: Configure Environment](#step-6-configure-environment)
-  - [Step 7: Add Your Documents](#step-7-add-your-documents)
-  - [Step 8: Ingest Documents](#step-8-ingest-documents)
-  - [Step 9: Start the Frontend](#step-9-start-the-frontend)
+  - [Linux Prerequisites](#linux-prerequisites)
+  - [Step 1: Setup Python Environment](#step-1-setup-python-environment)
+  - [Step 2: Install Pandoc (Optional)](#step-2-install-pandoc-optional)
+  - [Step 3: Configure Environment](#step-3-configure-environment)
+  - [Step 4: Add Your Documents](#step-4-add-your-documents)
+  - [Step 5: Ingest Documents](#step-5-ingest-documents)
+  - [Step 6: Start the Frontend](#step-6-start-the-frontend)
 - [Command-Line Query (Optional)](#command-line-query-optional)
+  - [Interactive Mode (Recommended)](#interactive-mode-recommended)
+  - [Single Query Mode](#single-query-mode)
 - [Performance Notes](#performance-notes)
   - [CPU vs GPU Mode](#cpu-vs-gpu-mode)
   - [Model Recommendations by Hardware](#model-recommendations-by-hardware)
@@ -43,86 +43,72 @@ A Retrieval-Augmented Generation (RAG) system that lets you query your documents
 
 ## Installation and Setup
 
-### Step 1: Install Ollama
+### Linux Prerequisites
 
-**Windows (via Winget):**
-```powershell
-winget install Ollama.Ollama -e
-```
+**For Ubuntu/Debian-based distributions:**
+```bash
+# Update package list
+sudo apt update
 
-Verify installation:
-```powershell
-ollama --version
-```
+# Install Python 3.10+ and pip
+sudo apt install python3 python3-pip python3-venv
 
-Ollama runs as a Windows service automatically. If not running:
-```powershell
-ollama serve
+# Install development tools (required for some Python packages)
+sudo apt install build-essential python3-dev
 ```
 
-**macOS (via Homebrew):**
+**For Fedora/RHEL/CentOS:**
 ```bash
-brew install ollama
-```
+# Install Python 3.10+ and pip
+sudo dnf install python3 python3-pip
 
-Verify installation:
-```bash
-ollama --version
+# Install development tools
+sudo dnf groupinstall "Development Tools"
+sudo dnf install python3-devel
 ```
 
-Start Ollama service:
+**For Arch Linux:**
 ```bash
-ollama serve
-```
-
-**macOS (Manual Download):**
-Download from [https://ollama.ai/download](https://ollama.ai/download) and install the .dmg file.
+# Install Python and pip
+sudo pacman -S python python-pip
 
-### Step 2: Pull Required Ollama Models
-
-**LLM Model (for answering queries):**
-```bash
-ollama pull qwen2.5:14b-instruct
+# Install base development tools
+sudo pacman -S base-devel
 ```
 
-**Embedding Model (for semantic search):**
+**Verify Python installation:**
 ```bash
-ollama pull mxbai-embed-large
+python3 --version  # Should be 3.10 or higher
+pip3 --version
 ```
 
-**Note for low-end computers:** The 14b model requires ~16GB RAM. If you have less RAM, use:
-```bash
-ollama pull qwen2.5:7b-instruct  # Requires ~8GB RAM
-```
-
-### Step 3: Enable CPU-Only Mode (For Low-End Computers)
-
-**If you have a low-end computer or insufficient GPU memory**, force Ollama to run on CPU only:
+### Step 1: Setup Python Environment
 
 **Windows:**
 ```powershell
-[System.Environment]::SetEnvironmentVariable('OLLAMA_NUM_GPU', '0', 'User')
-$env:OLLAMA_NUM_GPU = '0'
+python -m venv .venv
+.\.venv\Scripts\Activate.ps1
+```
+
+**Note:** If you get an execution policy error:
+```powershell
+Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
 ```
 
 **macOS/Linux:**
 ```bash
-echo 'export OLLAMA_NUM_GPU=0' >> ~/.bashrc  # or ~/.zshrc for zsh
-source ~/.bashrc  # or source ~/.zshrc
+python3 -m venv .venv
+source .venv/bin/activate
 ```
 
-Restart your terminal after setting this. The model will run slower but work on any computer.
-
-**To re-enable GPU later (if you upgrade hardware):**
-```powershell
-# Windows
-[System.Environment]::SetEnvironmentVariable('OLLAMA_NUM_GPU', '1', 'User')
-```
+**Install dependencies (all platforms):**
 ```bash
-# macOS/Linux - remove the line from ~/.bashrc or ~/.zshrc
+pip install -r requirements.txt
 ```
 
-### Step 4: Install Pandoc (Optional)
+**Note:** The first time you run a query, the Qwen model (~28GB for 14B model) will download automatically to `~/.cache/huggingface/`. This may take some time depending on your internet connection.
+
+### Step 2: Install Pandoc (Optional)
 
 Only needed if you have OpenDocument (.odt) files:
 
@@ -136,31 +122,20 @@ winget install --id JohnMacFarlane.Pandoc -e
 brew install pandoc
 ```
 
-### Step 5: Setup Python Environment
-
-**Windows:**
-```powershell
-python -m venv .venv
-.\.venv\Scripts\Activate.ps1
-```
-
-**Note:** If you get an execution policy error:
-```powershell
-Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser
-```
-
-**macOS/Linux:**
+**Linux:**
 ```bash
-python3 -m venv .venv
-source .venv/bin/activate
-```
+# Ubuntu/Debian
+sudo apt update
+sudo apt install pandoc
 
-**Install dependencies (all platforms):**
-```bash
-pip install -r requirements.txt
+# Fedora/RHEL/CentOS
+sudo dnf install pandoc
+
+# Arch Linux
+sudo pacman -S pandoc
 ```
 
-### Step 6: Configure Environment
+### Step 3: Configure Environment
 
 **Windows:**
 ```powershell
@@ -175,27 +150,39 @@ cp .env.example .env
 Edit `.env` with your settings:
 ```env
 # Model Configuration
-OLLAMA_MODEL=qwen2.5:14b-instruct
-OLLAMA_BASE_URL=http://localhost:11434
-EMBEDDING_MODEL=mxbai-embed-large
+LLM_PROVIDER=transformers
+TRANSFORMERS_MODEL=Qwen/Qwen2.5-14B-Instruct
+MAX_NEW_TOKENS=4096
+TEMPERATURE=0
+LLM_SEED=42
+QUANTIZATION=4bit
+
+# Device Configuration (auto, cuda, or cpu)
+LLM_DEVICE=auto
+EMBEDDING_DEVICE=cuda
+RERANKER_DEVICE=cuda
+
+# Embedding Configuration
+EMBEDDING_PROVIDER=huggingface
+EMBEDDING_MODEL=intfloat/multilingual-e5-large-instruct
 
 # Retrieval Settings
 RETRIEVAL_CHUNKS=100
-TOP_N_RERANK=15
+TOP_N_RERANK=8
 USE_RERANKING=true
 
 # Document Processing
 CHUNK_SIZE=800
-CHUNK_OVERLAP=160
+CHUNK_OVERLAP=100
 ```
 
-**Note:** If using 7b model on low-end computer, change to `OLLAMA_MODEL=qwen2.5:7b-instruct`
+**Note:** If using a lower-spec computer, change to `TRANSFORMERS_MODEL=Qwen/Qwen2.5-7B-Instruct` for faster performance. If you don't have a GPU, set all device settings to `cpu`.
 
-### Step 7: Add Your Documents
+### Step 4: Add Your Documents
 
 Place your documents (Word, PDF, PowerPoint, Text, Markdown, etc.) in the `docs/` folder.
 
-### Step 8: Ingest Documents
+### Step 5: Ingest Documents
 
 Run the ingestion script to process your documents:
 
@@ -215,7 +202,7 @@ This will:
 - Generate embeddings
 - Store vectors in the database
 
-### Step 9: Start the Frontend
+### Step 6: Start the Frontend
 
 Start the web interface:
 
@@ -237,25 +224,99 @@ Open this URL in your browser to start querying your documents!
 
 ## Command-Line Query (Optional)
 
-You can also run queries directly from the command line:
+### Interactive Mode (Recommended)
+
+For multiple queries without reloading the model each time:
+
+**macOS/Linux:**
+```bash
+python rag/query_interactive.py
+```
 
 **Windows:**
 ```powershell
-python rag\query.py "Your question here"
+python rag\query_interactive.py
+```
+
+This loads the model **once** and keeps it in memory. You can then ask multiple questions without the 15-second checkpoint loading delay.
+
+**Example session:**
 ```
+Query: What is V-PCC?
+[Answer streams in real-time...]
+
+Query: How does it compare to G-PCC?
+[Answer streams immediately - no reload!]
+
+Query: quit
+```
+
+### Single Query Mode
+
+For one-off queries from the command line:
 
 **macOS/Linux:**
 ```bash
 python rag/query.py "Your question here"
 ```
 
+**Windows:**
+```powershell
+python rag\query.py "Your question here"
+```
+
+**Note:** This reloads the model each time (~15s startup)
+
 ---
 
 ## Performance Notes
 
 ### CPU vs GPU Mode
-- **GPU Mode (default):** Fast responses (1-2 seconds with 14b model)
-- **CPU-Only Mode:** Slower responses (8-15 seconds with 14b model) but works on any computer
+
+The system can run on either CPU or GPU for optimal performance. You can configure which device each component uses in your `.env` file:
+
+```env
+# Device configuration
+# Options: auto (auto-detect GPU), cuda (force GPU), cpu (force CPU)
+LLM_DEVICE=auto              # Qwen language model
+EMBEDDING_DEVICE=cuda        # Document/query embeddings
+RERANKER_DEVICE=cuda         # Re-ranking model
+```
+
+**Device Options:**
+- `auto` - Automatically detects and uses GPU if available (recommended for LLM)
+- `cuda` - Forces GPU usage (fastest, requires NVIDIA GPU with CUDA)
+- `cpu` - Forces CPU usage (slower but works on any computer)
+
+**Performance Comparison (14B model):**
+- **GPU Mode (cuda):** Fast responses (1-2 seconds)
+- **CPU-Only Mode (cpu):** Slower responses (8-15 seconds) but works on any computer
+- **Auto Mode (auto):** Best of both worlds - uses GPU if available, falls back to CPU
+
+**Recommended Configurations:**
+
+*For systems with NVIDIA GPU:*
+```env
+LLM_DEVICE=auto              # Use GPU if available
+EMBEDDING_DEVICE=cuda        # Embeddings are 10-50x faster on GPU
+RERANKER_DEVICE=cuda         # Re-ranking is faster on GPU
+```
+
+*For CPU-only systems (no GPU):*
+```env
+LLM_DEVICE=cpu
+EMBEDDING_DEVICE=cpu
+RERANKER_DEVICE=cpu
+```
+
+*For systems with limited GPU memory:*
+```env
+LLM_DEVICE=cpu               # Save GPU memory
+EMBEDDING_DEVICE=cuda        # Embeddings use less memory
+RERANKER_DEVICE=cpu          # Only when needed
+```
+
+**Note:** After changing device settings, restart the application for changes to take effect. Re-ingestion is not required unless you change `EMBEDDING_DEVICE` after already ingesting documents.
 
 ### Model Recommendations by Hardware
 
@@ -349,26 +410,23 @@ No re-ingestion needed, changes apply immediately!
 - Use case: Complex reasoning, technical documents
 - Requirements: 32GB+ RAM recommended
 
-**Option 2 - Maximum Quality:** `qwen2.5:72b-instruct` (72B params, 48GB VRAM/64GB RAM)
+**Option 2 - Maximum Quality:** `Qwen/Qwen2.5-72B-Instruct` (72B params, 48GB VRAM/64GB RAM)
 - Speed: ⚡⚡ Slow (5x slower)
 - Quality: ⭐⭐⭐⭐⭐ Best available
 - Use case: Research, critical analysis, highest accuracy
 - Requirements: 64GB+ RAM, powerful hardware
 
-**Option 3 - Faster Lightweight:** `qwen2.5:7b-instruct` (7B params, 4GB VRAM/8GB RAM)
+**Option 3 - Faster Lightweight:** `Qwen/Qwen2.5-7B-Instruct` (7B params, 4GB VRAM/8GB RAM)
 - Speed: ⚡⚡⚡⚡⚡ Very Fast (2x faster)
 - Quality: ⭐⭐⭐ Good
 - Use case: Low-end hardware, quick responses
 
 **To upgrade LLM:**
-```bash
-# Pull new model
-ollama pull qwen2.5:32b-instruct
-
+```env
 # Update .env
-OLLAMA_MODEL=qwen2.5:32b-instruct
+TRANSFORMERS_MODEL=Qwen/Qwen2.5-32B-Instruct
 ```
-No re-ingestion needed!
+The new model will download automatically on first use. No re-ingestion needed!
 
 ### 🏆 **Recommended Production Configurations**
 
@@ -376,7 +434,7 @@ No re-ingestion needed!
 ```env
 EMBEDDING_MODEL=BAAI/bge-base-en-v1.5
 RERANKER_MODEL=BAAI/bge-reranker-base
-OLLAMA_MODEL=qwen2.5:14b-instruct
+TRANSFORMERS_MODEL=Qwen/Qwen2.5-14B-Instruct
 ```
 - **Speed:** Fast
 - **Quality:** Very Good
@@ -387,7 +445,7 @@ OLLAMA_MODEL=qwen2.5:14b-instruct
 ```env
 EMBEDDING_MODEL=BAAI/bge-large-en-v1.5
 RERANKER_MODEL=BAAI/bge-reranker-v2-m3
-OLLAMA_MODEL=qwen2.5:32b-instruct
+TRANSFORMERS_MODEL=Qwen/Qwen2.5-32B-Instruct
 ```
 - **Speed:** Moderate
 - **Quality:** Excellent
@@ -398,7 +456,7 @@ OLLAMA_MODEL=qwen2.5:32b-instruct
 ```env
 EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
 RERANKER_MODEL=BAAI/bge-reranker-base
-OLLAMA_MODEL=qwen2.5:14b-instruct
+TRANSFORMERS_MODEL=Qwen/Qwen2.5-14B-Instruct
 ```
 - **Speed:** Very Fast
 - **Quality:** Good
@@ -409,7 +467,7 @@ OLLAMA_MODEL=qwen2.5:14b-instruct
 ```env
 EMBEDDING_MODEL=BAAI/bge-m3
 RERANKER_MODEL=BAAI/bge-reranker-v2-m3
-OLLAMA_MODEL=qwen2.5:14b-instruct
+TRANSFORMERS_MODEL=Qwen/Qwen2.5-14B-Instruct
 ```
 - **Speed:** Moderate
 - **Quality:** Excellent
@@ -489,18 +547,18 @@ RERANKER_MODEL=BAAI/bge-reranker-v2-m3
 - The prompt automatically instructs it to respond in the question's language
 
 **Alternative Multilingual LLMs:**
-```bash
-ollama pull qwen2.5:32b-instruct    # Best multilingual quality
-ollama pull llama3.1:8b             # Good for European languages
-ollama pull mistral:7b-instruct     # Good for French/English
+```env
+# In .env file
+TRANSFORMERS_MODEL=Qwen/Qwen2.5-14B-Instruct    # Excellent for 100+ languages
+TRANSFORMERS_MODEL=Qwen/Qwen2.5-32B-Instruct    # Best multilingual quality
+# Other alternatives:
+# TRANSFORMERS_MODEL=meta-llama/Llama-3.1-8B-Instruct  # Good for European languages
 ```
 
 **Why it matters:**
 - Even with perfect retrieval, if LLM doesn't support the language, answers will be poor or in wrong language
 - Qwen models are already excellent for multilingual - upgrading mainly improves reasoning depth
 
-**⚠️ Requires re-ingestion:** NO - just update `.env` and restart
-
 ---
 
 ### Current System Multilingual Capability
@@ -509,7 +567,7 @@ ollama pull mistral:7b-instruct     # Good for French/English
 |-----------|---------------|---------------|------------------------|
 | **Embedding** | all-MiniLM-L6-v2 | ❌ English-only | 🔴 **Poor retrieval** for non-English questions |
 | **Reranker** | bge-reranker-base | ⚠️ English-focused | 🟡 **Suboptimal ranking** for non-English |
-| **LLM** | qwen2.5:14b-instruct | ✅ Excellent | ✅ **Perfect responses** in any language |
+| **LLM** | Qwen2.5-14B-Instruct | ✅ Excellent | ✅ **Perfect responses** in any language |
 
 **Result:** The LLM **CAN respond** in French/Spanish/etc., but will work with **lower-quality context** retrieved by English-only embeddings.
 
@@ -523,7 +581,7 @@ ollama pull mistral:7b-instruct     # Good for French/English
 # In .env file
 EMBEDDING_MODEL=BAAI/bge-m3
 RERANKER_MODEL=BAAI/bge-reranker-v2-m3
-OLLAMA_MODEL=qwen2.5:14b-instruct
+TRANSFORMERS_MODEL=Qwen/Qwen2.5-14B-Instruct
 ```
 
 **Steps:**
@@ -662,3 +720,5 @@ CHUNK_OVERLAP=150
 | 1500/300 | ~15,000 | Slower | Most Complete |
 
 **Rule of Thumb:** Overlap should be 10-20% of chunk size for optimal results.
+
+---
diff --git a/frontend/app.py b/frontend/app.py
index 3a23d60..b6a8523 100644
--- a/frontend/app.py
+++ b/frontend/app.py
@@ -64,8 +64,7 @@ class Handler(BaseHTTPRequestHandler):
             if not question:
                 self._send(400, json.dumps({"error": "Question is required"}))
                 return
-            provider = os.getenv("MODEL_PROVIDER", "ollama").lower()
-            answer, model_name, sources = run_query_complete(question, provider)
+            answer, model_name, sources = run_query_complete(question)
             self._send(200, json.dumps({"answer": answer, "model": model_name, "sources": sources}))
         except Exception as exc:
             import traceback
diff --git a/rag/ingest.py b/rag/ingest.py
index bf580b4..b6dcb43 100644
--- a/rag/ingest.py
+++ b/rag/ingest.py
@@ -8,7 +8,7 @@ from typing import List
 import re
 import xml.etree.ElementTree as ET
 
-from langchain_community.document_loaders import (  # type: ignore
+from langchain_community.document_loaders import (
     DirectoryLoader, 
     TextLoader, 
     UnstructuredPowerPointLoader, 
@@ -19,28 +19,27 @@ from langchain_community.document_loaders import (  # type: ignore
     UnstructuredMarkdownLoader,
     UnstructuredWordDocumentLoader
 )
-from langchain_text_splitters import RecursiveCharacterTextSplitter  # type: ignore
-from langchain_community.embeddings import FastEmbedEmbeddings  # type: ignore
-from langchain_huggingface import HuggingFaceEmbeddings  # type: ignore
-from langchain_chroma import Chroma  # type: ignore
-from langchain_core.documents import Document  # type: ignore
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_community.embeddings import FastEmbedEmbeddings
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_chroma import Chroma
+from langchain_core.documents import Document
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import logging
 from datetime import datetime
 
 try:
-    from docx import Document as DocxDocument  # type: ignore
+    from docx import Document as DocxDocument
     PYTHON_DOCX_AVAILABLE = True
 except ImportError:
     PYTHON_DOCX_AVAILABLE = False
 
 try:
-    from docx import Document as DocxDocument  # type: ignore
+    from docx import Document as DocxDocument
     PYTHON_DOCX_AVAILABLE = True
 except ImportError:
     PYTHON_DOCX_AVAILABLE = False
 
-
 def load_docx_with_python_docx(file_path: str) -> List[Document]:
     if not PYTHON_DOCX_AVAILABLE:
         raise ImportError("python-docx not available")
@@ -50,7 +49,6 @@ def load_docx_with_python_docx(file_path: str) -> List[Document]:
         raise ValueError("No text extracted")
     return [Document(page_content=text, metadata={"source": Path(file_path).name})]
 
-
 def load_docx_raw_xml(file_path: str) -> List[Document]:
     with zipfile.ZipFile(file_path, 'r') as docx_zip:
         try:
@@ -68,7 +66,6 @@ def load_docx_raw_xml(file_path: str) -> List[Document]:
         except Exception as e:
             raise ValueError(f"Failed to extract from XML: {e}")
 
-
 def extract_zip_files(docs_dir: Path) -> None:
     zip_files = list(docs_dir.glob("**/*.zip"))
     
@@ -129,7 +126,6 @@ def extract_zip_files(docs_dir: Path) -> None:
     
     print()
 
-
 def load_documents_batch(docs_dir: Path, batch_size: int = 50) -> tuple[List[Document], dict]:
     all_docs = []
     stats = {
@@ -265,7 +261,6 @@ def load_documents_batch(docs_dir: Path, batch_size: int = 50) -> tuple[List[Doc
     
     return all_docs, stats
 
-
 def main() -> None:
     load_dotenv()
 
@@ -334,16 +329,18 @@ def main() -> None:
 
     embedding_provider = os.getenv("EMBEDDING_PROVIDER", "fastembed").lower()
     embedding_model = os.getenv("EMBEDDING_MODEL")
+    embedding_device = os.getenv("EMBEDDING_DEVICE", "cuda").lower()
     
     print(f"\nSetting up embeddings...")
     print(f"  Provider: {embedding_provider}")
     print(f"  Model: {embedding_model}")
+    print(f"  Device: {embedding_device}")
     
     if embedding_provider == "huggingface":
         print("  Using HuggingFace embeddings (supports any model)")
         embeddings = HuggingFaceEmbeddings(
             model_name=embedding_model,
-            model_kwargs={'device': 'cpu'},  
+            model_kwargs={'device': embedding_device},  
             encode_kwargs={'normalize_embeddings': True}
         )
     else:  
@@ -406,6 +403,5 @@ def main() -> None:
     
     print(f"\n✅ Vector store is ready for querying")
 
-
 if __name__ == "__main__":
     main()
diff --git a/rag/query.py b/rag/query.py
index 9002067..3d748ed 100644
--- a/rag/query.py
+++ b/rag/query.py
@@ -4,7 +4,10 @@ from pathlib import Path
 from typing import List
 
 from dotenv import load_dotenv
-from langchain_ollama import ChatOllama  # type: ignore
+try:
+    from .transformers_llm import TransformersLLM  # type: ignore
+except Exception:
+    from transformers_llm import TransformersLLM  # type: ignore
 from langchain_community.embeddings import FastEmbedEmbeddings  # type: ignore
 from langchain_huggingface import HuggingFaceEmbeddings  # type: ignore
 from langchain_chroma import Chroma  # type: ignore
@@ -12,6 +15,17 @@ from langchain_core.documents import Document  # type: ignore
 from langchain_core.prompts import ChatPromptTemplate  # type: ignore
 from sentence_transformers import CrossEncoder
 import time
+import hashlib
+import random
+
+try:
+    import numpy as _np
+except Exception:
+    _np = None
+try:
+    import torch as _torch
+except Exception:
+    _torch = None
 
 
 class Reranker:
@@ -57,9 +71,41 @@ def get_embeddings():
 
 
 def get_llm():
-    model_name = os.getenv("OLLAMA_MODEL")
-    base_url = os.getenv("OLLAMA_BASE_URL")
-    return ChatOllama(model=model_name, base_url=base_url), model_name
+
+    load_dotenv()
+
+    model_name = os.getenv("TRANSFORMERS_MODEL")
+    quantization = os.getenv("QUANTIZATION", "none")
+    device = os.getenv("LLM_DEVICE", "auto")
+    seed = int(os.getenv("LLM_SEED", "0"))
+
+    if seed and seed > 0:
+        random.seed(seed)
+        if _np is not None:
+            _np.random.seed(seed)
+        if _torch is not None:
+            try:
+                _torch.manual_seed(seed)
+                _torch.cuda.manual_seed_all(seed)
+                _torch.use_deterministic_algorithms(True)
+            except Exception:
+                pass
+
+    global _LLM_CACHE
+    try:
+        _LLM_CACHE
+    except NameError:
+        _LLM_CACHE = {}
+
+    cache_key = (model_name, quantization, device)
+    if cache_key in _LLM_CACHE:
+        return _LLM_CACHE[cache_key], _LLM_CACHE[cache_key].model_name
+
+
+    llm = TransformersLLM()
+
+    _LLM_CACHE[cache_key] = llm
+    return llm, llm.model_name
 
 
 
@@ -176,15 +222,24 @@ Write your answer now (aim for 300-500+ words with extensive technical detail):"
 
     llm, model_name = get_llm()
     chain = prompt | llm
-    
+
     context_text = format_docs(docs)
+    temp = os.getenv("TEMPERATURE", str(getattr(llm, "temperature", "unknown")))
+    seed = os.getenv("LLM_SEED", "0")
+    sources = [d.metadata.get("source", "unknown") for d in docs]
+    prompt_payload = f"question:{query}\ncontext:{context_text}"
+    prompt_hash = hashlib.sha256(prompt_payload.encode("utf-8")).hexdigest()
 
     answer = ""
     for chunk in chain.stream({"question": query, "context": context_text}):
-        answer += chunk.content
-
-    sources = [d.metadata.get('source', 'unknown') for d in docs]
-    
+        if hasattr(chunk, "content"):
+            answer += chunk.content
+        elif hasattr(chunk, "text"):
+            answer += chunk.text
+        else:
+            answer += str(chunk)
+
+    sources = [d.metadata.get("source", "unknown") for d in docs]
     return answer, model_name, sources
 
 
@@ -312,16 +367,26 @@ Write your answer now (aim for 300-500+ words with extensive technical detail):"
 
     llm, model_name = get_llm()
     chain = prompt | llm
+    context_text = format_docs(docs)
+    temp = os.getenv("TEMPERATURE", str(getattr(llm, "temperature", "unknown")))
+    seed = os.getenv("LLM_SEED", "0")
+    sources = [d.metadata.get("source", "unknown") for d in docs]
+    prompt_payload = f"question:{query}\ncontext:{context_text}"
+    prompt_hash = hashlib.sha256(prompt_payload.encode("utf-8")).hexdigest()
 
     print(f"Querying model: {model_name}\n")
-    context_text = format_docs(docs)
     
     print("=== Answer ===\n")
     query_start = time.time()
     
     try:
         for chunk in chain.stream({"question": query, "context": context_text}):
-            print(chunk.content, end="", flush=True)
+            if hasattr(chunk, "content"):
+                print(chunk.content, end="", flush=True)
+            elif hasattr(chunk, "text"):
+                print(chunk.text, end="", flush=True)
+            else:
+                print(str(chunk), end="", flush=True)
         print(f"\n\n[Query completed in {time.time() - query_start:.2f}s]")
     except Exception as e:
         msg = str(e).lower()
@@ -340,4 +405,4 @@ Write your answer now (aim for 300-500+ words with extensive technical detail):"
 
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file
diff --git a/rag/transformers_llm.py b/rag/transformers_llm.py
new file mode 100644
index 0000000..5b0b0c5
--- /dev/null
+++ b/rag/transformers_llm.py
@@ -0,0 +1,280 @@
+import os
+os.environ.setdefault("TRANSFORMERS_VERBOSITY", os.getenv("TRANSFORMERS_VERBOSITY", "error"))
+
+from typing import Any, Iterator, Optional, List
+import re
+from threading import Thread
+
+import torch
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    TextIteratorStreamer,
+    GenerationConfig,
+)
+
+from langchain_core.language_models.llms import BaseLLM
+from langchain_core.callbacks.manager import CallbackManagerForLLMRun
+from langchain_core.outputs import GenerationChunk, LLMResult, Generation
+
+
+class TransformersLLM(BaseLLM):
+
+    model_name: str = "Qwen/Qwen2.5-14B-Instruct"
+    max_new_tokens: int = 2048
+    temperature: float = 0.7
+    model: Any = None
+    tokenizer: Any = None
+
+    def __init__(
+        self,
+        model_name: str = "Qwen/Qwen2.5-14B-Instruct",
+        device: str = "auto",
+        dtype: str = "auto",
+        max_new_tokens: int = 2048,
+        temperature: float = 0.7,
+        quantization: str = "none",
+        **kwargs,
+    ):
+        env_model = os.getenv("TRANSFORMERS_MODEL")
+        env_quant = os.getenv("QUANTIZATION")
+        env_device = os.getenv("LLM_DEVICE")
+        env_max_tokens = os.getenv("MAX_NEW_TOKENS")
+        env_temp = os.getenv("TEMPERATURE")
+
+        if env_model:
+            model_name = env_model
+        if env_quant:
+            quantization = env_quant
+        if env_device:
+            device = env_device
+        if env_max_tokens:
+            try:
+                max_new_tokens = int(env_max_tokens)
+            except Exception:
+                pass
+        if env_temp:
+            try:
+                temperature = float(env_temp)
+            except Exception:
+                pass
+
+        super().__init__(
+            model_name=model_name,
+            max_new_tokens=max_new_tokens,
+            temperature=temperature,
+            **kwargs,
+        )
+
+        print(f"Loading model: {model_name}")
+        if quantization == "4bit":
+            print("Using 4-bit quantization (reduces memory ~75%)")
+        elif quantization == "8bit":
+            print("Using 8-bit quantization (reduces memory ~50%)")
+        else:
+            print("This will download ~28GB on first run...")
+
+        dtype_map = {
+            "float16": torch.float16,
+            "bfloat16": torch.bfloat16,
+            "float32": torch.float32,
+            "auto": "auto",
+        }
+        dtype_obj = dtype_map.get(dtype, "auto")
+
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+        quantization_config = None
+        if quantization == "4bit":
+            quantization_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_compute_dtype=torch.float16,
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_quant_type="nf4",
+            )
+        elif quantization == "8bit":
+            quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+
+        model_kwargs = {"device_map": device, "trust_remote_code": True}
+        if quantization_config:
+            model_kwargs["quantization_config"] = quantization_config
+        else:
+            model_kwargs["dtype"] = dtype_obj
+
+        self.model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)
+
+        self._generation_lock = None
+        try:
+            import threading
+
+            self._generation_lock = threading.Lock()
+        except Exception:
+            self._generation_lock = None
+
+        print(f"✓ Model loaded successfully on {self.model.device}")
+
+    @property
+    def _llm_type(self) -> str:
+        return "transformers"
+
+    def _generate(
+        self,
+        prompts: List[str],
+        stop: Optional[List[str]] = None,
+        run_manager: Optional[CallbackManagerForLLMRun] = None,
+        **kwargs: Any,
+    ) -> LLMResult:
+        generations = []
+        for prompt in prompts:
+            text = self._call(prompt, stop=stop, run_manager=run_manager, **kwargs)
+            generations.append([Generation(text=text)])
+        return LLMResult(generations=generations)
+
+    def _call(
+        self,
+        prompt: str,
+        stop: Optional[list[str]] = None,
+        run_manager: Optional[CallbackManagerForLLMRun] = None,
+        **kwargs: Any,
+    ) -> str:
+        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
+        temp = float(kwargs.get("temperature", self.temperature))
+        do_sample = bool(kwargs.get("do_sample", temp > 0))
+        top_p = kwargs.get("top_p", 0.9)
+        top_k = kwargs.get("top_k", None)
+
+        gen_kwargs = dict(
+            **inputs,
+            max_new_tokens=self.max_new_tokens,
+            pad_token_id=self.tokenizer.eos_token_id,
+            eos_token_id=self.tokenizer.eos_token_id,
+            use_cache=True,
+            repetition_penalty=1.1,
+        )
+
+        try:
+            gen_cfg = GenerationConfig(do_sample=do_sample)
+            if do_sample:
+                gen_cfg.temperature = temp
+                if top_p is not None:
+                    gen_cfg.top_p = float(top_p)
+                if top_k is not None:
+                    gen_cfg.top_k = int(top_k)
+            gen_kwargs["generation_config"] = gen_cfg
+        except Exception:
+            gen_kwargs["do_sample"] = do_sample
+            if do_sample:
+                gen_kwargs["temperature"] = temp
+                if top_p is not None:
+                    gen_kwargs["top_p"] = float(top_p)
+                if top_k is not None:
+                    gen_kwargs["top_k"] = int(top_k)
+
+        with torch.no_grad():
+            lock = getattr(self, "_generation_lock", None)
+            if lock is not None:
+                lock.acquire()
+            try:
+                outputs = self.model.generate(**gen_kwargs)
+            finally:
+                if lock is not None:
+                    lock.release()
+
+        response = self.tokenizer.decode(
+            outputs[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True
+        )
+
+        def _clean_warnings(s: str) -> str:
+            if not s:
+                return s
+            patterns = [
+                r"The following generation flags are not valid[\s\S]*?$",
+                r"Set `TRANSFORMERS_VERBOSITY=info`[\s\S]*?$",
+                r"transformers_verbosity",
+            ]
+            out = s.replace("\r\n", "\n")
+            for p in patterns:
+                out = re.sub(p, "", out, flags=re.IGNORECASE)
+            out = re.sub(r"\n{3,}", "\n\n", out)
+            return out.strip()
+
+        return _clean_warnings(response)
+
+    def _stream(
+        self,
+        prompt: str,
+        stop: Optional[list[str]] = None,
+        run_manager: Optional[CallbackManagerForLLMRun] = None,
+        **kwargs: Any,
+    ) -> Iterator[GenerationChunk]:
+        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
+
+        streamer = TextIteratorStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+        temp = float(kwargs.get("temperature", self.temperature))
+        do_sample = bool(kwargs.get("do_sample", temp > 0))
+        top_p = kwargs.get("top_p", 0.9)
+        top_k = kwargs.get("top_k", None)
+
+        generation_kwargs = {
+            **inputs,
+            "max_new_tokens": self.max_new_tokens,
+            "pad_token_id": self.tokenizer.eos_token_id,
+            "eos_token_id": self.tokenizer.eos_token_id,
+            "streamer": streamer,
+            "use_cache": True,
+            "repetition_penalty": 1.1,
+        }
+
+        try:
+            gen_cfg = GenerationConfig(do_sample=do_sample)
+            if do_sample:
+                gen_cfg.temperature = temp
+                if top_p is not None:
+                    gen_cfg.top_p = float(top_p)
+                if top_k is not None:
+                    gen_cfg.top_k = int(top_k)
+            generation_kwargs["generation_config"] = gen_cfg
+        except Exception:
+            generation_kwargs["do_sample"] = do_sample
+            if do_sample:
+                generation_kwargs["temperature"] = temp
+                if top_p is not None:
+                    generation_kwargs["top_p"] = float(top_p)
+                if top_k is not None:
+                    generation_kwargs["top_k"] = int(top_k)
+
+        thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        buffer = ""
+        patterns = [
+            r"The following generation flags are not valid[\s\S]*?(?:$|\n)",
+            r"may be ignored[\s\S]*?(?:$|\n)",
+            r"Set `TRANSFORMERS_VERBOSITY=info`[\s\S]*?(?:$|\n)",
+            r"transformers_verbosity",
+            r"\[.*temperature.*\]",
+        ]
+
+        for fragment in streamer:
+            chunk_text = fragment if isinstance(fragment, str) else str(fragment)
+            buffer += chunk_text
+            for p in patterns:
+                buffer = re.sub(p, "", buffer, flags=re.IGNORECASE)
+            if "\n" in buffer:
+                to_emit, buffer = buffer.rsplit("\n", 1)
+                to_emit = to_emit + "\n"
+                to_emit = re.sub(r"\n{3,}", "\n\n", to_emit)
+                if to_emit.strip():
+                    yield GenerationChunk(text=to_emit)
+
+        thread.join()
+        if buffer:
+            for p in patterns:
+                buffer = re.sub(p, "", buffer, flags=re.IGNORECASE)
+            buffer = re.sub(r"\n{3,}", "\n\n", buffer)
+            if buffer.strip():
+                if not buffer.endswith("\n"):
+                    buffer = buffer + "\n"
+                yield GenerationChunk(text=buffer)
diff --git a/requirements.txt b/requirements.txt
index 1a696bd..9fa605b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,5 @@
 langchain>=1.0.8
 langchain-community>=0.4.1
-langchain-ollama>=1.0.0
 langchain-chroma>=0.1.2
 langchain-huggingface>=0.1.0
 chromadb>=1.3.5
@@ -18,4 +17,8 @@ pdf2image>=1.17.0
 unstructured-inference>=0.7.36
 pdfminer.six>=20231228
 flashrank>=0.2.0
-sentence-transformers>=2.2.0
\ No newline at end of file
+sentence-transformers>=2.2.0
+transformers>=4.40.0
+torch>=2.0.0
+accelerate>=0.27.0
+bitsandbytes>=0.41.0
\ No newline at end of file
-- 
GitLab