mrveiss · mrveiss · Mar 30, 2026 · Mar 30, 2026
@@ -15,7 +15,7 @@ backend:
       # gpu_endpoint: http://172.16.168.20:11434
       # gpu_models:
       #   - "qwen3.5:9b"
-      #   - "deepseek-r1:14b"
+      #   - "mistral:7b-instruct"
       #   - "codellama:13b"
 
 # Infrastructure host overrides

@@ -642,7 +642,7 @@ def test_cache_hit_skips_summary_llm_call(self, mock_get_redis):
         )
 
         cached_payload = json.dumps(
-            {"summary": "cached summary", "model": "llama3.2:3b"}
+            {"summary": "cached summary", "model": "llama3.2:1b"}
         )
         mock_get_redis.return_value = self._mock_redis(cached=cached_payload)
         cog = ContextGeneratorCognifier()

@@ -308,7 +308,7 @@ def setup_method(self):
 
         with patch("src.orchestrator.global_config_manager") as mock_config:
             mock_config.get_llm_config.return_value = {
-                "orchestrator_llm": "llama3.2:3b"
+                "orchestrator_llm": "llama3.2:1b"
             }
             self.orchestrator = Orchestrator()
 

@@ -248,7 +248,7 @@ def _get_config_updates(self) -> dict:
             }],
             "src/config.py": [
                 {
-                    "find": f'"orchestrator": os.getenv("AUTOBOT_ORCHESTRATOR_MODEL", "llama3.2:3b")',
+                    "find": f'"orchestrator": os.getenv("AUTOBOT_ORCHESTRATOR_MODEL", "llama3.2:1b")',
                     "replace": f'"orchestrator": os.getenv("AUTOBOT_ORCHESTRATOR_MODEL", "{_ROUTING_MODEL}")',
                     "line_context": "models configuration — orchestrator",
                 },
@@ -259,7 +259,7 @@ def _get_config_updates(self) -> dict:
                 },
             ],
             "backend/utils/connection_utils.py": [{
-                "find": '"deepseek-r1:14b"',
+                "find": '"qwen3.5:9b"',
                 "replace": f'"{_DEFAULT_MODEL}"',
                 "line_context": "AUTOBOT_DEFAULT_LLM_MODEL default",
             }],

@@ -209,7 +209,7 @@ def generate_browser_fix_script(self):
             use_phi2: false,
             api_endpoint: 'http://localhost:8001',
             ollama_endpoint: 'http://localhost:11434',
-            ollama_model: 'deepseek-r1:14b',
+            ollama_model: 'qwen3.5:9b',
             streaming: false
         },
         ui: {

@@ -508,7 +508,7 @@ async def get_available_models(self) -> List[str]:
         """Get list of available models for NPU."""
         return [
             "llama3.2:1b-instruct-q4_K_M",
-            "llama3.2:3b-instruct-q4_K_M",
+            "llama3.2:1b-instruct-q4_K_M",
             "nomic-embed-text",
             "text-classification-model",
         ]

@@ -89,8 +89,8 @@ def get_components(self) -> Dict[str, Any]:
                 ],
                 "models": [
                     "artifish/llama3.2-uncensored:latest (2.2GB) - GPU",
-                    "deepseek-r1:14b (8.4GB) - GPU for complex reasoning",
-                    "llama3.2:3b-instruct-q4_K_M (2GB) - GPU backup",
+                    "qwen3.5:9b (5.2GB) - GPU for complex reasoning",
+                    "mistral:7b-instruct (4.1GB) - GPU backup",
                 ],
             },
             "windows_npu_worker": {

@@ -192,18 +192,18 @@ def create_model_recommendations():
                 "orchestrator": "artifish/llama3.2-uncensored:latest",  # 2.2GB
                 "rag": "artifish/llama3.2-uncensored:latest",  # 2.2GB
                 "research": "artifish/llama3.2-uncensored:latest",  # 2.2GB
-                "chat": "llama3.2:3b-instruct-q4_K_M",  # 2GB
+                "chat": "mistral:7b-instruct",  # 4.1GB
                 "analysis": "artifish/llama3.2-uncensored:latest",  # 2.2GB
                 "planning": "artifish/llama3.2-uncensored:latest",  # 2.2GB
             }
             parallel_capacity = "2-3 concurrent models"
         elif total_memory >= 6000:  # 6GB GPU
             recommended_models = {
-                "orchestrator": "llama3.2:3b-instruct-q4_K_M",  # 2GB
+                "orchestrator": "llama3.2:1b-instruct-q4_K_M",  # 807MB
                 "rag": "artifish/llama3.2-uncensored:latest",  # 2.2GB
-                "research": "llama3.2:3b-instruct-q4_K_M",  # 2GB
+                "research": "mistral:7b-instruct",  # 4.1GB
                 "chat": "llama3.2:1b-instruct-q4_K_M",  # 807MB
-                "analysis": "llama3.2:3b-instruct-q4_K_M",  # 2GB
+                "analysis": "mistral:7b-instruct",  # 4.1GB
             }
             parallel_capacity = "2 concurrent models"
         else:  # 4GB GPU

@@ -378,8 +378,8 @@ const llmConfig = {
     providers: {
       ollama: {
         endpoint: 'http://localhost:11434/api/generate',
-        models: ['deepseek-r1:14b', 'dolphin-llama3:8b'],
-        selected_model: 'deepseek-r1:14b'
+        models: ['qwen3.5:9b', 'dolphin-llama3:8b'],
+        selected_model: 'qwen3.5:9b'
       }
     }
   },

@@ -93,7 +93,7 @@ llm:
   ollama:
     enabled: true
     base_url: "http://localhost:11434"
-    default_model: "deepseek-r1:14b"
+    default_model: "qwen3.5:9b"
   openai:
     enabled: false
     api_key: ""

@@ -179,7 +179,7 @@ backend:
       # gpu_endpoint: http://172.16.168.20:11434
       # gpu_models:
       #   - "qwen3.5:9b"
-      #   - "deepseek-r1:14b"
+      #   - "mistral:7b-instruct"
       #   - "codellama:13b"
 
 # Infrastructure host overrides
@@ -1018,7 +1018,7 @@ backend:
       gpu_endpoint: http://172.16.168.20:11434    # GPU-accelerated endpoint
       gpu_models:
         - "qwen3.5:9b"
-        - "deepseek-r1:14b"
+        - "mistral:7b-instruct"
         - "codellama:13b"
 ```
 

@@ -1054,7 +1054,7 @@ backend:
       # gpu_endpoint: http://172.16.168.20:11434
       # gpu_models:
       #   - "qwen3.5:9b"
-      #   - "deepseek-r1:14b"
+      #   - "mistral:7b-instruct"
 
 # Fallback path for _get_ollama_endpoint_fallback() via get_host("ollama")
 infrastructure:

@@ -93,7 +93,7 @@ llm:
   ollama:
     enabled: true
     base_url: "http://localhost:11434"
-    default_model: "deepseek-r1:14b"
+    default_model: "qwen3.5:9b"
   openai:
     enabled: false
     api_key: ""

@@ -109,15 +109,15 @@ stages:
   cognify:
     tasks:
       - name: extract_entities
-        model: llama3.2:3b
+        model: llama3.2:1b
         batch_size: 50
         confidence_threshold: 0.7
       - name: extract_relationships
-        model: llama3.2:3b
+        model: llama3.2:1b
         batch_size: 50
       - name: extract_events
         enabled: true
-        model: llama3.2:3b
+        model: llama3.2:1b
       - name: generate_summaries
         enabled: true
         levels: [chunk, section, document]
@@ -451,7 +451,7 @@ class EntityExtractor(BaseCognifier):
     """LLM-powered entity extraction from text chunks."""
 
     def __init__(self, config: dict):
-        self.model = config.get('model', 'llama3.2:3b')
+        self.model = config.get('model', 'llama3.2:1b')
         self.confidence_threshold = config.get('confidence_threshold', 0.7)
         self.llm = LLMService(model=self.model)
 
@@ -565,7 +565,7 @@ class RelationshipExtractor(BaseCognifier):
     """LLM-powered relationship extraction."""
 
     def __init__(self, config: dict):
-        self.model = config.get('model', 'llama3.2:3b')
+        self.model = config.get('model', 'llama3.2:1b')
         self.llm = LLMService(model=self.model)
 
     async def process(
@@ -786,7 +786,7 @@ class EventExtractor(BaseCognifier):
     """LLM-powered event and temporal extraction."""
 
     def __init__(self, config: dict):
-        self.model = config.get('model', 'llama3.2:3b')
+        self.model = config.get('model', 'llama3.2:1b')
         self.llm = LLMService(model=self.model)
 
     async def process(
@@ -1036,7 +1036,7 @@ class HierarchicalSummarizer(BaseCognifier):
     """Generate multi-level summaries of documents."""
 
     def __init__(self, config: dict):
-        self.model = config.get('model', 'llama3.2:3b')
+        self.model = config.get('model', 'llama3.2:1b')
         self.levels = config.get('levels', ['chunk', 'section', 'document'])
         self.max_words = config.get('max_words', {
             'chunk': 50,
@@ -1449,7 +1449,7 @@ autobot-frontend/src/components/knowledge/
 # Add to config/ssot_config.py
 knowledge_graph:
   pipeline:
-    default_model: "llama3.2:3b"
+    default_model: "llama3.2:1b"
     batch_size: 50
     confidence_threshold: 0.7
   temporal:

@@ -167,7 +167,7 @@ export async function discoverService(
 | Local Tier | Default Model | SLM Provider |
 |------------|---------------|--------------|
 | TIER_1 | llama3.2:1b | ollama |
-| TIER_2 | llama3.2:3b | ollama |
+| TIER_2 | llama3.2:1b | ollama |
 | TIER_3 | qwen3.5:9b | ollama |
 | TIER_4 | qwen3.5:9b | ollama |