diff --git a/autobot-backend/config/config.yaml b/autobot-backend/config/config.yaml index 2ece551fd..1e731f55c 100644 --- a/autobot-backend/config/config.yaml +++ b/autobot-backend/config/config.yaml @@ -15,7 +15,7 @@ backend: # gpu_endpoint: http://172.16.168.20:11434 # gpu_models: # - "qwen3.5:9b" - # - "deepseek-r1:14b" + # - "mistral:7b-instruct" # - "codellama:13b" # Infrastructure host overrides diff --git a/autobot-backend/knowledge/pipeline/cognifiers/cognifiers_test.py b/autobot-backend/knowledge/pipeline/cognifiers/cognifiers_test.py index 563c26129..321425529 100644 --- a/autobot-backend/knowledge/pipeline/cognifiers/cognifiers_test.py +++ b/autobot-backend/knowledge/pipeline/cognifiers/cognifiers_test.py @@ -642,7 +642,7 @@ def test_cache_hit_skips_summary_llm_call(self, mock_get_redis): ) cached_payload = json.dumps( - {"summary": "cached summary", "model": "llama3.2:3b"} + {"summary": "cached summary", "model": "llama3.2:1b"} ) mock_get_redis.return_value = self._mock_redis(cached=cached_payload) cog = ContextGeneratorCognifier() diff --git a/autobot-backend/performance_benchmarks.performance_test.py b/autobot-backend/performance_benchmarks.performance_test.py index 7b4534354..ceb913eeb 100644 --- a/autobot-backend/performance_benchmarks.performance_test.py +++ b/autobot-backend/performance_benchmarks.performance_test.py @@ -308,7 +308,7 @@ def setup_method(self): with patch("src.orchestrator.global_config_manager") as mock_config: mock_config.get_llm_config.return_value = { - "orchestrator_llm": "llama3.2:3b" + "orchestrator_llm": "llama3.2:1b" } self.orchestrator = Orchestrator() diff --git a/autobot-infrastructure/shared/scripts/ai-ml/optimize_llm_models.py b/autobot-infrastructure/shared/scripts/ai-ml/optimize_llm_models.py index 0bd3152cc..15e4768b8 100644 --- a/autobot-infrastructure/shared/scripts/ai-ml/optimize_llm_models.py +++ b/autobot-infrastructure/shared/scripts/ai-ml/optimize_llm_models.py @@ -248,7 +248,7 @@ def 
_get_config_updates(self) -> dict: }], "src/config.py": [ { - "find": f'"orchestrator": os.getenv("AUTOBOT_ORCHESTRATOR_MODEL", "llama3.2:3b")', + "find": f'"orchestrator": os.getenv("AUTOBOT_ORCHESTRATOR_MODEL", "llama3.2:1b")', "replace": f'"orchestrator": os.getenv("AUTOBOT_ORCHESTRATOR_MODEL", "{_ROUTING_MODEL}")', "line_context": "models configuration — orchestrator", }, @@ -259,7 +259,7 @@ def _get_config_updates(self) -> dict: }, ], "backend/utils/connection_utils.py": [{ - "find": '"deepseek-r1:14b"', + "find": '"qwen3.5:9b"', "replace": f'"{_DEFAULT_MODEL}"', "line_context": "AUTOBOT_DEFAULT_LLM_MODEL default", }], diff --git a/autobot-infrastructure/shared/scripts/utilities/fix_settings_loading.py b/autobot-infrastructure/shared/scripts/utilities/fix_settings_loading.py index 603a5c7a8..3bde35ad4 100644 --- a/autobot-infrastructure/shared/scripts/utilities/fix_settings_loading.py +++ b/autobot-infrastructure/shared/scripts/utilities/fix_settings_loading.py @@ -209,7 +209,7 @@ def generate_browser_fix_script(self): use_phi2: false, api_endpoint: 'http://localhost:8001', ollama_endpoint: 'http://localhost:11434', - ollama_model: 'deepseek-r1:14b', + ollama_model: 'qwen3.5:9b', streaming: false }, ui: { diff --git a/autobot-infrastructure/shared/scripts/utilities/npu_worker.py b/autobot-infrastructure/shared/scripts/utilities/npu_worker.py index 7f37c024f..1f778eeaa 100644 --- a/autobot-infrastructure/shared/scripts/utilities/npu_worker.py +++ b/autobot-infrastructure/shared/scripts/utilities/npu_worker.py @@ -508,7 +508,6 @@ async def get_available_models(self) -> List[str]: """Get list of available models for NPU.""" return [ "llama3.2:1b-instruct-q4_K_M", - "llama3.2:3b-instruct-q4_K_M", "nomic-embed-text", "text-classification-model", ] diff --git a/autobot-infrastructure/shared/scripts/utilities/npu_worker_design.py b/autobot-infrastructure/shared/scripts/utilities/npu_worker_design.py index dfd64192b..a018ef836 100644
--- a/autobot-infrastructure/shared/scripts/utilities/npu_worker_design.py +++ b/autobot-infrastructure/shared/scripts/utilities/npu_worker_design.py @@ -89,8 +89,8 @@ def get_components(self) -> Dict[str, Any]: ], "models": [ "artifish/llama3.2-uncensored:latest (2.2GB) - GPU", - "deepseek-r1:14b (8.4GB) - GPU for complex reasoning", - "llama3.2:3b-instruct-q4_K_M (2GB) - GPU backup", + "qwen3.5:9b (5.2GB) - GPU for complex reasoning", + "mistral:7b-instruct (4.1GB) - GPU backup", ], }, "windows_npu_worker": { diff --git a/autobot-infrastructure/shared/scripts/utilities/optimize_gpu_usage.py b/autobot-infrastructure/shared/scripts/utilities/optimize_gpu_usage.py index bf4a4d750..bea9f240a 100644 --- a/autobot-infrastructure/shared/scripts/utilities/optimize_gpu_usage.py +++ b/autobot-infrastructure/shared/scripts/utilities/optimize_gpu_usage.py @@ -192,18 +192,18 @@ def create_model_recommendations(): "orchestrator": "artifish/llama3.2-uncensored:latest", # 2.2GB "rag": "artifish/llama3.2-uncensored:latest", # 2.2GB "research": "artifish/llama3.2-uncensored:latest", # 2.2GB - "chat": "llama3.2:3b-instruct-q4_K_M", # 2GB + "chat": "mistral:7b-instruct", # 4.1GB "analysis": "artifish/llama3.2-uncensored:latest", # 2.2GB "planning": "artifish/llama3.2-uncensored:latest", # 2.2GB } parallel_capacity = "2-3 concurrent models" elif total_memory >= 6000: # 6GB GPU recommended_models = { - "orchestrator": "llama3.2:3b-instruct-q4_K_M", # 2GB + "orchestrator": "llama3.2:1b-instruct-q4_K_M", # 807MB "rag": "artifish/llama3.2-uncensored:latest", # 2.2GB - "research": "llama3.2:3b-instruct-q4_K_M", # 2GB + "research": "mistral:7b-instruct", # 4.1GB "chat": "llama3.2:1b-instruct-q4_K_M", # 807MB - "analysis": "llama3.2:3b-instruct-q4_K_M", # 2GB + "analysis": "mistral:7b-instruct", # 4.1GB } parallel_capacity = "2 concurrent models" else: # 4GB GPU diff --git a/docs/frontend/settings-panel-guide.md b/docs/frontend/settings-panel-guide.md index 0e39c74d4..b7d481d20 100644 --- 
a/docs/frontend/settings-panel-guide.md +++ b/docs/frontend/settings-panel-guide.md @@ -378,8 +378,8 @@ const llmConfig = { providers: { ollama: { endpoint: 'http://localhost:11434/api/generate', - models: ['deepseek-r1:14b', 'dolphin-llama3:8b'], - selected_model: 'deepseek-r1:14b' + models: ['qwen3.5:9b', 'dolphin-llama3:8b'], + selected_model: 'qwen3.5:9b' } } }, diff --git a/docs/guides/LLM_Interface_Migration_Guide.md b/docs/guides/LLM_Interface_Migration_Guide.md index 45d8e8a1e..089db766a 100644 --- a/docs/guides/LLM_Interface_Migration_Guide.md +++ b/docs/guides/LLM_Interface_Migration_Guide.md @@ -93,7 +93,7 @@ llm: ollama: enabled: true base_url: "http://localhost:11434" - default_model: "deepseek-r1:14b" + default_model: "qwen3.5:9b" openai: enabled: false api_key: "" diff --git a/docs/guides/chat-ollama-configuration.md b/docs/guides/chat-ollama-configuration.md index 818f5349c..97af47e1f 100644 --- a/docs/guides/chat-ollama-configuration.md +++ b/docs/guides/chat-ollama-configuration.md @@ -179,7 +179,7 @@ backend: # gpu_endpoint: http://172.16.168.20:11434 # gpu_models: # - "qwen3.5:9b" - # - "deepseek-r1:14b" + # - "mistral:7b-instruct" # - "codellama:13b" # Infrastructure host overrides @@ -1018,7 +1018,7 @@ backend: gpu_endpoint: http://172.16.168.20:11434 # GPU-accelerated endpoint gpu_models: - "qwen3.5:9b" - - "deepseek-r1:14b" + - "mistral:7b-instruct" - "codellama:13b" ``` diff --git a/docs/guides/llm-middleware-telemetry.md b/docs/guides/llm-middleware-telemetry.md index e39f7d9b4..a3626e33b 100644 --- a/docs/guides/llm-middleware-telemetry.md +++ b/docs/guides/llm-middleware-telemetry.md @@ -1054,7 +1054,7 @@ backend: # gpu_endpoint: http://172.16.168.20:11434 # gpu_models: # - "qwen3.5:9b" - # - "deepseek-r1:14b" + # - "mistral:7b-instruct" # Fallback path for _get_ollama_endpoint_fallback() via get_host("ollama") infrastructure: diff --git a/docs/migration/LLM_Interface_Migration_Guide.md b/docs/migration/LLM_Interface_Migration_Guide.md 
index 45d8e8a1e..089db766a 100644 --- a/docs/migration/LLM_Interface_Migration_Guide.md +++ b/docs/migration/LLM_Interface_Migration_Guide.md @@ -93,7 +93,7 @@ llm: ollama: enabled: true base_url: "http://localhost:11434" - default_model: "deepseek-r1:14b" + default_model: "qwen3.5:9b" openai: enabled: false api_key: "" diff --git a/docs/plans/2026-02-02-knowledge-graph-enhancement-design.md b/docs/plans/2026-02-02-knowledge-graph-enhancement-design.md index 12c5bba21..29dd8a6d3 100644 --- a/docs/plans/2026-02-02-knowledge-graph-enhancement-design.md +++ b/docs/plans/2026-02-02-knowledge-graph-enhancement-design.md @@ -109,15 +109,15 @@ stages: cognify: tasks: - name: extract_entities - model: llama3.2:3b + model: llama3.2:1b batch_size: 50 confidence_threshold: 0.7 - name: extract_relationships - model: llama3.2:3b + model: llama3.2:1b batch_size: 50 - name: extract_events enabled: true - model: llama3.2:3b + model: llama3.2:1b - name: generate_summaries enabled: true levels: [chunk, section, document] @@ -451,7 +451,7 @@ class EntityExtractor(BaseCognifier): """LLM-powered entity extraction from text chunks.""" def __init__(self, config: dict): - self.model = config.get('model', 'llama3.2:3b') + self.model = config.get('model', 'llama3.2:1b') self.confidence_threshold = config.get('confidence_threshold', 0.7) self.llm = LLMService(model=self.model) @@ -565,7 +565,7 @@ class RelationshipExtractor(BaseCognifier): """LLM-powered relationship extraction.""" def __init__(self, config: dict): - self.model = config.get('model', 'llama3.2:3b') + self.model = config.get('model', 'llama3.2:1b') self.llm = LLMService(model=self.model) async def process( @@ -786,7 +786,7 @@ class EventExtractor(BaseCognifier): """LLM-powered event and temporal extraction.""" def __init__(self, config: dict): - self.model = config.get('model', 'llama3.2:3b') + self.model = config.get('model', 'llama3.2:1b') self.llm = LLMService(model=self.model) async def process( @@ -1036,7 +1036,7 @@ class 
HierarchicalSummarizer(BaseCognifier): """Generate multi-level summaries of documents.""" def __init__(self, config: dict): - self.model = config.get('model', 'llama3.2:3b') + self.model = config.get('model', 'llama3.2:1b') self.levels = config.get('levels', ['chunk', 'section', 'document']) self.max_words = config.get('max_words', { 'chunk': 50, @@ -1449,7 +1449,7 @@ autobot-frontend/src/components/knowledge/ # Add to config/ssot_config.py knowledge_graph: pipeline: - default_model: "llama3.2:3b" + default_model: "llama3.2:1b" batch_size: 50 confidence_threshold: 0.7 temporal: diff --git a/docs/plans/2026-02-02-phase3-client-library-design.md b/docs/plans/2026-02-02-phase3-client-library-design.md index f734522a5..ec95656b1 100644 --- a/docs/plans/2026-02-02-phase3-client-library-design.md +++ b/docs/plans/2026-02-02-phase3-client-library-design.md @@ -167,7 +167,7 @@ export async function discoverService( | Local Tier | Default Model | SLM Provider | |------------|---------------|--------------| | TIER_1 | llama3.2:1b | ollama | -| TIER_2 | llama3.2:3b | ollama | +| TIER_2 | llama3.2:1b | ollama | | TIER_3 | qwen3.5:9b | ollama | | TIER_4 | qwen3.5:9b | ollama |