{
  "metadata": {
    "generated_at": "2026-05-27T10:20:04.960Z",
    "project": "Can My GPU Run This LLM",
    "dataset_type": "agent_model_guide",
    "model_count": 23,
    "source_files": [
      "/data/models.json",
      "/data/applications.json"
    ],
    "policy": "Agent-facing planning data. Estimates are practical guidance, not benchmarks.",
    "commercial_policy": "Technical recommendations and compatibility status must not be changed by affiliate, referral, sponsorship, or discount relationships.",
    "ranking_influenced_by_affiliate": false,
    "last_curated_by": "Codex"
  },
  "context_profiles": [
    {
      "id": "quick",
      "label": "Quick chat / small prompt",
      "multiplier": 0.9,
      "note": "Short Q&A, shell help, small config snippets, quick translation."
    },
    {
      "id": "coding",
      "label": "Coding assistant / scripts",
      "multiplier": 1,
      "note": "Focused coding, small repo edits, review support, debugging one or two files."
    },
    {
      "id": "repo",
      "label": "Repo or long chat",
      "multiplier": 1.15,
      "note": "Longer conversations, README plus source files, multi-step code reasoning."
    },
    {
      "id": "rag",
      "label": "PDF / document analysis",
      "multiplier": 1.35,
      "note": "Document summaries, meeting notes, research pages, RAG-style retrieval prompts."
    },
    {
      "id": "agent",
      "label": "Agent with tools",
      "multiplier": 1.45,
      "note": "Tool calls, planning loops, repeated instructions, memory, and workflow state."
    },
    {
      "id": "extreme",
      "label": "Extreme long context",
      "multiplier": 1.7,
      "note": "Large document batches, whole-project context, heavy RAG, or long autonomous sessions."
    }
  ],
  "quantization_profiles": [
    {
      "id": "q4",
      "bits": 4
    },
    {
      "id": "q5",
      "bits": 5
    },
    {
      "id": "q8",
      "bits": 8
    },
    {
      "id": "fp16",
      "bits": 16
    }
  ],
  "models": [
    {
      "id": "llama-3-1-8b",
      "model_name": "Llama 3.1 8B Instruct",
      "family": "Llama",
      "provider": "Meta",
      "license_type": "open-weight",
      "parameters_billions": 8,
      "architecture": "dense",
      "ollama_tag": "llama3.1:8b",
      "recommended_quantization": "Q4_K_M",
      "best_for": [
        "chat",
        "agents/tool workflows",
        "Fast local chat, lightweight agents, low-cost local testing"
      ],
      "weak_for": [
        "coding",
        "vision/image understanding",
        "reasoning",
        "Complex coding and deep reasoning"
      ],
      "not_for": [
        "image understanding / multimodal vision",
        "coding-heavy workloads",
        "deep multi-step reasoning"
      ],
      "agent_readiness_score": 78,
      "context": {
        "minimum_context_tokens": 8192,
        "typical_context_tokens": 128000,
        "context_fit_note": "Large context-capable on paper; verify runtime memory and backend limits."
      },
      "memory_estimates_gb": {
        "quantization": {
          "q4": 6,
          "q5": 7.5,
          "q8": 12,
          "fp16": 24
        },
        "q4_context_profiles": {
          "quick": 5.4,
          "coding": 6,
          "repo": 6.9,
          "rag": 8.1,
          "agent": 8.7,
          "extreme": 10.2
        },
        "caveat": "Estimates are practical planning estimates. Real use varies by backend, KV cache, context length, drivers, quantization file, and offloading."
      },
      "hardware": {
        "minimum_vram_gb_q4": 8,
        "comfortable_vram_gb_q4": 12,
        "recommended_gpu": "RTX 4060 8GB or better",
        "local_fit_note": "Good small-local-model candidate for 8GB+ GPUs at Q4."
      },
      "routing_guidance": {
        "green": "Prefer local inference first if the selected GPU has comfortable headroom.",
        "yellow": "Reduce context, use a lower-bit quantization, try a smaller model, or expect RAM/offload slowdown.",
        "red": "Use a smaller local model, larger GPU, cloud GPU fallback, or compare API/cloud costs.",
        "commercial_option_ids": [],
        "ranking_influenced_by_affiliate": false
      },
      "recommended_applications": [
        {
          "id": "hermes-kellerrechner-agent",
          "name": "Hermes / Kellerrechner Agent",
          "category": "Personal AI assistant"
        },
        {
          "id": "gravity-claw-business-operator",
          "name": "Gravity Claw Business Operator",
          "category": "Business automation agent"
        },
        {
          "id": "telegram-ai-bot",
          "name": "Telegram AI Bot",
          "category": "Chat and automation bot"
        },
        {
          "id": "knowledge-vault-agent",
          "name": "Obsidian / Knowledge Vault Agent",
          "category": "Knowledge and memory agent"
        },
        {
          "id": "social-publishing-approval-worker",
          "name": "Social Publishing Approval Worker",
          "category": "Marketing automation agent"
        }
      ],
      "sources": [
        {
          "label": "Ollama library",
          "url": "https://ollama.com/library/llama3.1"
        }
      ],
      "last_verified": "2026-05-19",
      "confidence": "high"
    },
    {
      "id": "llama-3-1-70b",
      "model_name": "Llama 3.1 70B Instruct",
      "family": "Llama",
      "provider": "Meta",
      "license_type": "open-weight",
      "parameters_billions": 70,
      "architecture": "dense",
      "ollama_tag": "llama3.1:70b",
      "recommended_quantization": "Q4_K_M",
      "best_for": [
        "chat",
        "coding",
        "agents/tool workflows",
        "reasoning",
        "High-quality local chat and reasoning on workstation-class hardware"
      ],
      "weak_for": [
        "vision/image understanding",
        "Too large for single 24GB consumer GPUs without heavy offload"
      ],
      "not_for": [
        "image understanding / multimodal vision"
      ],
      "agent_readiness_score": 88,
      "context": {
        "minimum_context_tokens": 8192,
        "typical_context_tokens": 128000,
        "context_fit_note": "Large context-capable on paper; verify runtime memory and backend limits."
      },
      "memory_estimates_gb": {
        "quantization": {
          "q4": 44,
          "q5": 55,
          "q8": 88,
          "fp16": 176
        },
        "q4_context_profiles": {
          "quick": 39.6,
          "coding": 44,
          "repo": 50.6,
          "rag": 59.4,
          "agent": 63.8,
          "extreme": 74.8
        },
        "caveat": "Estimates are practical planning estimates. Real use varies by backend, KV cache, context length, drivers, quantization file, and offloading."
      },
      "hardware": {
        "minimum_vram_gb_q4": 48,
        "comfortable_vram_gb_q4": 64,
        "recommended_gpu": "48GB+ VRAM workstation or multi-GPU setup",
        "local_fit_note": "Large local model. Prefer 48GB+ VRAM, multi-GPU, cloud GPU, or hosted API fallback."
      },
      "routing_guidance": {
        "green": "Prefer local inference first if the selected GPU has comfortable headroom.",
        "yellow": "Reduce context, use a lower-bit quantization, try a smaller model, or expect RAM/offload slowdown.",
        "red": "Use a smaller local model, larger GPU, cloud GPU fallback, or compare API/cloud costs.",
        "commercial_option_ids": [
          "runpod-cloud-gpu-fallback",
          "apiroute-cloud-api-comparison"
        ],
        "ranking_influenced_by_affiliate": false
      },
      "recommended_applications": [
        {
          "id": "desktop-multi-agent-command-center",
          "name": "Desktop Multi-Agent Command Center",
          "category": "Agent orchestration"
        }
      ],
      "sources": [
        {
          "label": "Ollama library",
          "url": "https://ollama.com/library/llama3.1"
        }
      ],
      "last_verified": "2026-05-19",
      "confidence": "high"
    },
    {
      "id": "qwen2-5-coder-7b",
      "model_name": "Qwen2.5 Coder 7B",
      "family": "Qwen",
      "provider": "Alibaba",
      "license_type": "open-weight",
      "parameters_billions": 7,
      "architecture": "dense",
      "ollama_tag": "qwen2.5-coder:7b",
      "recommended_quantization": "Q4_K_M",
      "best_for": [
        "chat",
        "coding",
        "agents/tool workflows",
        "Small local coding assistant and agent tool generation"
      ],
      "weak_for": [
        "vision/image understanding",
        "reasoning",
        "Larger refactors and complex multi-file reasoning"
      ],
      "not_for": [
        "image understanding / multimodal vision",
        "deep multi-step reasoning"
      ],
      "agent_readiness_score": 82,
      "context": {
        "minimum_context_tokens": 8192,
        "typical_context_tokens": 32768,
        "context_fit_note": "Good practical context range for coding, chat, and moderate RAG."
      },
      "memory_estimates_gb": {
        "quantization": {
          "q4": 5.5,
          "q5": 6.88,
          "q8": 11,
          "fp16": 22
        },
        "q4_context_profiles": {
          "quick": 4.95,
          "coding": 5.5,
          "repo": 6.32,
          "rag": 7.43,
          "agent": 7.97,
          "extreme": 9.35
        },
        "caveat": "Estimates are practical planning estimates. Real use varies by backend, KV cache, context length, drivers, quantization file, and offloading."
      },
      "hardware": {
        "minimum_vram_gb_q4": 8,
        "comfortable_vram_gb_q4": 12,
        "recommended_gpu": "RTX 4060 8GB or better",
        "local_fit_note": "Good small-local-model candidate for 8GB+ GPUs at Q4."
      },
      "routing_guidance": {
        "green": "Prefer local inference first if the selected GPU has comfortable headroom.",
        "yellow": "Reduce context, use a lower-bit quantization, try a smaller model, or expect RAM/offload slowdown.",
        "red": "Use a smaller local model, larger GPU, cloud GPU fallback, or compare API/cloud costs.",
        "commercial_option_ids": [],
        "ranking_influenced_by_affiliate": false
      },
      "recommended_applications": [
        {
          "id": "social-publishing-approval-worker",
          "name": "Social Publishing Approval Worker",
          "category": "Marketing automation agent"
        }
      ],
      "sources": [
        {
          "label": "Ollama library",
          "url": "https://ollama.com/library/qwen2.5-coder"
        }
      ],
      "last_verified": "2026-05-19",
      "confidence": "high"
    },
    {
      "id": "qwen2-5-coder-14b",
      "model_name": "Qwen2.5 Coder 14B",
      "family": "Qwen",
      "provider": "Alibaba",
      "license_type": "open-weight",
      "parameters_billions": 14,
      "architecture": "dense",
      "ollama_tag": "qwen2.5-coder:14b",
      "recommended_quantization": "Q4_K_M",
      "best_for": [
        "chat",
        "coding",
        "agents/tool workflows",
        "reasoning",
        "Local coding, scripts, repo assistance, technical agents"
      ],
      "weak_for": [
        "vision/image understanding",
        "Can be tight on 12GB GPUs at longer context"
      ],
      "not_for": [
        "image understanding / multimodal vision"
      ],
      "agent_readiness_score": 88,
      "context": {
        "minimum_context_tokens": 8192,
        "typical_context_tokens": 32768,
        "context_fit_note": "Good practical context range for coding, chat, and moderate RAG."
      },
      "memory_estimates_gb": {
        "quantization": {
          "q4": 10.5,
          "q5": 13.13,
          "q8": 21,
          "fp16": 42
        },
        "q4_context_profiles": {
          "quick": 9.45,
          "coding": 10.5,
          "repo": 12.07,
          "rag": 14.18,
          "agent": 15.22,
          "extreme": 17.85
        },
        "caveat": "Estimates are practical planning estimates. Real use varies by backend, KV cache, context length, drivers, quantization file, and offloading."
      },
      "hardware": {
        "minimum_vram_gb_q4": 12,
        "comfortable_vram_gb_q4": 16,
        "recommended_gpu": "RTX 3060 12GB minimum, 16GB+ preferred",
        "local_fit_note": "Practical 12GB local-agent candidate at Q4 with headroom checks."
      },
      "routing_guidance": {
        "green": "Prefer local inference first if the selected GPU has comfortable headroom.",
        "yellow": "Reduce context, use a lower-bit quantization, try a smaller model, or expect RAM/offload slowdown.",
        "red": "Use a smaller local model, larger GPU, cloud GPU fallback, or compare API/cloud costs.",
        "commercial_option_ids": [],
        "ranking_influenced_by_affiliate": false
      },
      "recommended_applications": [
        {
          "id": "hermes-kellerrechner-agent",
          "name": "Hermes / Kellerrechner Agent",
          "category": "Personal AI assistant"
        },
        {
          "id": "gravity-claw-business-operator",
          "name": "Gravity Claw Business Operator",
          "category": "Business automation agent"
        },
        {
          "id": "local-coding-agent",
          "name": "Local Coding Agent",
          "category": "Coding agent"
        },
        {
          "id": "knowledge-vault-agent",
          "name": "Obsidian / Knowledge Vault Agent",
          "category": "Knowledge and memory agent"
        }
      ],
      "sources": [
        {
          "label": "Ollama library",
          "url": "https://ollama.com/library/qwen2.5-coder"
        }
      ],
      "last_verified": "2026-05-19",
      "confidence": "high"
    },
    {
      "id": "qwen2-5-coder-32b",
      "model_name": "Qwen2.5 Coder 32B",
      "family": "Qwen",
      "provider": "Alibaba",
      "license_type": "open-weight",
      "parameters_billions": 32,
      "architecture": "dense",
      "ollama_tag": "qwen2.5-coder:32b",
      "recommended_quantization": "Q4_K_M",
      "best_for": [
        "chat",
        "coding",
        "agents/tool workflows",
        "reasoning",
        "Strong local coding and architecture work on 24GB GPUs"
      ],
      "weak_for": [
        "vision/image understanding",
        "Little VRAM headroom on single 24GB GPUs with long context"
      ],
      "not_for": [
        "image understanding / multimodal vision"
      ],
      "agent_readiness_score": 92,
      "context": {
        "minimum_context_tokens": 8192,
        "typical_context_tokens": 32768,
        "context_fit_note": "Good practical context range for coding, chat, and moderate RAG."
      },
      "memory_estimates_gb": {
        "quantization": {
          "q4": 21,
          "q5": 26.25,
          "q8": 42,
          "fp16": 84
        },
        "q4_context_profiles": {
          "quick": 18.9,
          "coding": 21,
          "repo": 24.15,
          "rag": 28.35,
          "agent": 30.45,
          "extreme": 35.7
        },
        "caveat": "Estimates are practical planning estimates. Real use varies by backend, KV cache, context length, drivers, quantization file, and offloading."
      },
      "hardware": {
        "minimum_vram_gb_q4": 24,
        "comfortable_vram_gb_q4": 32,
        "recommended_gpu": "RTX 3090/4090 24GB minimum, 32GB+ preferred",
        "local_fit_note": "Workstation-local candidate. Prefer 32GB+ VRAM for agents or long context."
      },
      "routing_guidance": {
        "green": "Prefer local inference first if the selected GPU has comfortable headroom.",
        "yellow": "Reduce context, use a lower-bit quantization, try a smaller model, or expect RAM/offload slowdown.",
        "red": "Use a smaller local model, larger GPU, cloud GPU fallback, or compare API/cloud costs.",
        "commercial_option_ids": [
          "apiroute-cloud-api-comparison"
        ],
        "ranking_influenced_by_affiliate": false
      },
      "recommended_applications": [
        {
          "id": "gravity-claw-business-operator",
          "name": "Gravity Claw Business Operator",
          "category": "Business automation agent"
        },
        {
          "id": "local-coding-agent",
          "name": "Local Coding Agent",
          "category": "Coding agent"
        },
        {
          "id": "desktop-multi-agent-command-center",
          "name": "Desktop Multi-Agent Command Center",
          "category": "Agent orchestration"
        }
      ],
      "sources": [
        {
          "label": "Ollama library",
          "url": "https://ollama.com/library/qwen2.5-coder"
        }
      ],
      "last_verified": "2026-05-19",
      "confidence": "high"
    },
    {
      "id": "qwen3-8b",
      "model_name": "Qwen3 8B",
      "family": "Qwen",
      "provider": "Alibaba",
      "license_type": "open-weight",
      "parameters_billions": 8,
      "architecture": "dense",
      "ollama_tag": "qwen3:8b",
      "recommended_quantization": "Q4_K_M",
      "best_for": [
        "chat",
        "coding",
        "agents/tool workflows",
        "reasoning",
        "Fast general local assistant with reasoning/coding balance"
      ],
      "weak_for": [
        "vision/image understanding",
        "Less capable than 14B/32B models for large tasks"
      ],
      "not_for": [
        "image understanding / multimodal vision"
      ],
      "agent_readiness_score": 86,
      "context": {
        "minimum_context_tokens": 8192,
        "typical_context_tokens": 40000,
        "context_fit_note": "Good practical context range for coding, chat, and moderate RAG."
      },
      "memory_estimates_gb": {
        "quantization": {
          "q4": 6,
          "q5": 7.5,
          "q8": 12,
          "fp16": 24
        },
        "q4_context_profiles": {
          "quick": 5.4,
          "coding": 6,
          "repo": 6.9,
          "rag": 8.1,
          "agent": 8.7,
          "extreme": 10.2
        },
        "caveat": "Estimates are practical planning estimates. Real use varies by backend, KV cache, context length, drivers, quantization file, and offloading."
      },
      "hardware": {
        "minimum_vram_gb_q4": 8,
        "comfortable_vram_gb_q4": 12,
        "recommended_gpu": "RTX 4060 8GB or better",
        "local_fit_note": "Good small-local-model candidate for 8GB+ GPUs at Q4."
      },
      "routing_guidance": {
        "green": "Prefer local inference first if the selected GPU has comfortable headroom.",
        "yellow": "Reduce context, use a lower-bit quantization, try a smaller model, or expect RAM/offload slowdown.",
        "red": "Use a smaller local model, larger GPU, cloud GPU fallback, or compare API/cloud costs.",
        "commercial_option_ids": [],
        "ranking_influenced_by_affiliate": false
      },
      "recommended_applications": [
        {
          "id": "hermes-kellerrechner-agent",
          "name": "Hermes / Kellerrechner Agent",
          "category": "Personal AI assistant"
        },
        {
          "id": "telegram-ai-bot",
          "name": "Telegram AI Bot",
          "category": "Chat and automation bot"
        },
        {
          "id": "knowledge-vault-agent",
          "name": "Obsidian / Knowledge Vault Agent",
          "category": "Knowledge and memory agent"
        },
        {
          "id": "social-publishing-approval-worker",
          "name": "Social Publishing Approval Worker",
          "category": "Marketing automation agent"
        }
      ],
      "sources": [
        {
          "label": "Ollama library",
          "url": "https://ollama.com/library/qwen3"
        }
      ],
      "last_verified": "2026-05-19",
      "confidence": "medium"
    },
    {
      "id": "qwen3-coder-30b",
      "model_name": "Qwen3-Coder 30B-A3B",
      "family": "Qwen3-Coder",
      "provider": "Alibaba",
      "license_type": "open-weight",
      "parameters_billions": 30,
      "architecture": "moe",
      "ollama_tag": "qwen3-coder:30b",
      "recommended_quantization": "Q4_K_M",
      "best_for": [
        "chat",
        "coding",
        "agents/tool workflows",
        "reasoning",
        "Agentic coding, repository-scale local code review, and tool-heavy development loops"
      ],
      "weak_for": [
        "vision/image understanding",
        "Long context leaves little headroom on single 24GB GPUs"
      ],
      "not_for": [
        "image understanding / multimodal vision"
      ],
      "agent_readiness_score": 94,
      "context": {
        "minimum_context_tokens": 32768,
        "typical_context_tokens": 256000,
        "context_fit_note": "Large context-capable on paper; verify runtime memory and backend limits."
      },
      "memory_estimates_gb": {
        "quantization": {
          "q4": 19,
          "q5": 23.75,
          "q8": 38,
          "fp16": 76
        },
        "q4_context_profiles": {
          "quick": 17.1,
          "coding": 19,
          "repo": 21.85,
          "rag": 25.65,
          "agent": 27.55,
          "extreme": 32.3
        },
        "caveat": "Estimates are practical planning estimates. Real use varies by backend, KV cache, context length, drivers, quantization file, and offloading."
      },
      "hardware": {
        "minimum_vram_gb_q4": 24,
        "comfortable_vram_gb_q4": 32,
        "recommended_gpu": "RTX 3090/4090 24GB minimum, 32GB+ preferred",
        "local_fit_note": "Workstation-local candidate. Prefer 32GB+ VRAM for agents or long context."
      },
      "routing_guidance": {
        "green": "Prefer local inference first if the selected GPU has comfortable headroom.",
        "yellow": "Reduce context, use a lower-bit quantization, try a smaller model, or expect RAM/offload slowdown.",
        "red": "Use a smaller local model, larger GPU, cloud GPU fallback, or compare API/cloud costs.",
        "commercial_option_ids": [
          "apiroute-cloud-api-comparison"
        ],
        "ranking_influenced_by_affiliate": false
      },
      "recommended_applications": [],
      "sources": [
        {
          "label": "Ollama library",
          "url": "https://ollama.com/library/qwen3-coder"
        },
        {
          "label": "Qwen blog",
          "url": "https://qwenlm.github.io/blog/qwen3-coder/"
        }
      ],
      "last_verified": "2026-05-26",
      "confidence": "high"
    },
    {
      "id": "qwen3-5-9b",
      "model_name": "Qwen3.5 9B",
      "family": "Qwen3.5",
      "provider": "Alibaba",
      "license_type": "open-weight",
      "parameters_billions": 9,
      "architecture": "dense",
      "ollama_tag": "qwen3.5:9b",
      "recommended_quantization": "Q4_K_M",
      "best_for": [
        "chat",
        "coding",
        "agents/tool workflows",
        "vision/image understanding",
        "reasoning",
        "Modern multimodal local assistant, agent experiments, and coding support on mainstream GPUs"
      ],
      "weak_for": [
        "Still a small model for large repo-scale coding tasks"
      ],
      "not_for": [],
      "agent_readiness_score": 88,
      "context": {
        "minimum_context_tokens": 32768,
        "typical_context_tokens": 256000,
        "context_fit_note": "Large context-capable on paper; verify runtime memory and backend limits."
      },
      "memory_estimates_gb": {
        "quantization": {
          "q4": 6.6,
          "q5": 8.25,
          "q8": 13.2,
          "fp16": 26.4
        },
        "q4_context_profiles": {
          "quick": 5.94,
          "coding": 6.6,
          "repo": 7.59,
          "rag": 8.91,
          "agent": 9.57,
          "extreme": 11.22
        },
        "caveat": "Estimates are practical planning estimates. Real use varies by backend, KV cache, context length, drivers, quantization file, and offloading."
      },
      "hardware": {
        "minimum_vram_gb_q4": 8,
        "comfortable_vram_gb_q4": 12,
        "recommended_gpu": "RTX 4060 8GB minimum, RTX 3060 12GB preferred",
        "local_fit_note": "Good small-local-model candidate for 8GB+ GPUs at Q4."
      },
      "routing_guidance": {
        "green": "Prefer local inference first if the selected GPU has comfortable headroom.",
        "yellow": "Reduce context, use a lower-bit quantization, try a smaller model, or expect RAM/offload slowdown.",
        "red": "Use a smaller local model, larger GPU, cloud GPU fallback, or compare API/cloud costs.",
        "commercial_option_ids": [],
        "ranking_influenced_by_affiliate": false
      },
      "recommended_applications": [],
      "sources": [
        {
          "label": "Ollama library",
          "url": "https://ollama.com/library/qwen3.5"
        },
        {
          "label": "Qwen research index",
          "url": "https://qwen.ai/research"
        }
      ],
      "last_verified": "2026-05-26",
      "confidence": "medium"
    },
    {
      "id": "qwen3-5-27b",
      "model_name": "Qwen3.5 27B",
      "family": "Qwen3.5",
      "provider": "Alibaba",
      "license_type": "open-weight",
      "parameters_billions": 27,
      "architecture": "dense",
      "ollama_tag": "qwen3.5:27b",
      "recommended_quantization": "Q4_K_M",
      "best_for": [
        "chat",
        "coding",
        "agents/tool workflows",
        "vision/image understanding",
        "reasoning",
        "24GB-class multimodal agent, coding assistant, and reasoning workloads"
      ],
      "weak_for": [
        "Long multimodal context can exceed single 24GB headroom"
      ],
      "not_for": [],
      "agent_readiness_score": 92,
      "context": {
        "minimum_context_tokens": 32768,
        "typical_context_tokens": 256000,
        "context_fit_note": "Large context-capable on paper; verify runtime memory and backend limits."
      },
      "memory_estimates_gb": {
        "quantization": {
          "q4": 17,
          "q5": 21.25,
          "q8": 34,
          "fp16": 68
        },
        "q4_context_profiles": {
          "quick": 15.3,
          "coding": 17,
          "repo": 19.55,
          "rag": 22.95,
          "agent": 24.65,
          "extreme": 28.9
        },
        "caveat": "Estimates are practical planning estimates. Real use varies by backend, KV cache, context length, drivers, quantization file, and offloading."
      },
      "hardware": {
        "minimum_vram_gb_q4": 24,
        "comfortable_vram_gb_q4": 32,
        "recommended_gpu": "RTX 3090/4090 24GB minimum, 32GB+ preferred",
        "local_fit_note": "Workstation-local candidate. Prefer 32GB+ VRAM for agents or long context."
      },
      "routing_guidance": {
        "green": "Prefer local inference first if the selected GPU has comfortable headroom.",
        "yellow": "Reduce context, use a lower-bit quantization, try a smaller model, or expect RAM/offload slowdown.",
        "red": "Use a smaller local model, larger GPU, cloud GPU fallback, or compare API/cloud costs.",
        "commercial_option_ids": [
          "apiroute-cloud-api-comparison"
        ],
        "ranking_influenced_by_affiliate": false
      },
      "recommended_applications": [],
      "sources": [
        {
          "label": "Ollama library",
          "url": "https://ollama.com/library/qwen3.5"
        },
        {
          "label": "Qwen research index",
          "url": "https://qwen.ai/research"
        }
      ],
      "last_verified": "2026-05-26",
      "confidence": "medium"
    },
    {
      "id": "deepseek-r1-distill-qwen-8b",
      "model_name": "DeepSeek R1 Distill Qwen 8B",
      "family": "DeepSeek R1 Distill",
      "provider": "DeepSeek",
      "license_type": "open-weight",
      "parameters_billions": 8,
      "architecture": "dense",
      "ollama_tag": "deepseek-r1:8b",
      "recommended_quantization": "Q4_K_M",
      "best_for": [
        "chat",
        "coding",
        "reasoning",
        "Local reasoning experiments and step-by-step technical analysis"
      ],
      "weak_for": [
        "agents/tool workflows",
        "vision/image understanding",
        "Verbose reasoning can slow simple agent workflows"
      ],
      "not_for": [
        "image understanding / multimodal vision",
        "long-running tool agents"
      ],
      "agent_readiness_score": 72,
      "context": {
        "minimum_context_tokens": 8192,
        "typical_context_tokens": 128000,
        "context_fit_note": "Large context-capable on paper; verify runtime memory and backend limits."
      },
      "memory_estimates_gb": {
        "quantization": {
          "q4": 6,
          "q5": 7.5,
          "q8": 12,
          "fp16": 24
        },
        "q4_context_profiles": {
          "quick": 5.4,
          "coding": 6,
          "repo": 6.9,
          "rag": 8.1,
          "agent": 8.7,
          "extreme": 10.2
        },
        "caveat": "Estimates are practical planning estimates. Real use varies by backend, KV cache, context length, drivers, quantization file, and offloading."
      },
      "hardware": {
        "minimum_vram_gb_q4": 8,
        "comfortable_vram_gb_q4": 12,
        "recommended_gpu": "RTX 4060 8GB or better",
        "local_fit_note": "Good small-local-model candidate for 8GB+ GPUs at Q4."
      },
      "routing_guidance": {
        "green": "Prefer local inference first if the selected GPU has comfortable headroom.",
        "yellow": "Reduce context, use a lower-bit quantization, try a smaller model, or expect RAM/offload slowdown.",
        "red": "Use a smaller local model, larger GPU, cloud GPU fallback, or compare API/cloud costs.",
        "commercial_option_ids": [],
        "ranking_influenced_by_affiliate": false
      },
      "recommended_applications": [],
      "sources": [
        {
          "label": "Ollama library",
          "url": "https://ollama.com/library/deepseek-r1"
        }
      ],
      "last_verified": "2026-05-19",
      "confidence": "high"
    },
    {
      "id": "deepseek-r1-distill-qwen-14b",
      "model_name": "DeepSeek R1 Distill Qwen 14B",
      "family": "DeepSeek R1 Distill",
      "provider": "DeepSeek",
      "license_type": "open-weight",
      "parameters_billions": 14,
      "architecture": "dense",
      "ollama_tag": "deepseek-r1:14b",
      "recommended_quantization": "Q4_K_M",
      "best_for": [
        "chat",
        "coding",
        "reasoning",
        "Local reasoning and debugging on 12GB/16GB GPUs"
      ],
      "weak_for": [
        "agents/tool workflows",
        "vision/image understanding",
        "Less ergonomic for fast Telegram-style assistant responses"
      ],
      "not_for": [
        "image understanding / multimodal vision",
        "long-running tool agents"
      ],
      "agent_readiness_score": 76,
      "context": {
        "minimum_context_tokens": 8192,
        "typical_context_tokens": 128000,
        "context_fit_note": "Large context-capable on paper; verify runtime memory and backend limits."
      },
      "memory_estimates_gb": {
        "quantization": {
          "q4": 10.5,
          "q5": 13.13,
          "q8": 21,
          "fp16": 42
        },
        "q4_context_profiles": {
          "quick": 9.45,
          "coding": 10.5,
          "repo": 12.07,
          "rag": 14.18,
          "agent": 15.22,
          "extreme": 17.85
        },
        "caveat": "Estimates are practical planning estimates. Real use varies by backend, KV cache, context length, drivers, quantization file, and offloading."
      },
      "hardware": {
        "minimum_vram_gb_q4": 12,
        "comfortable_vram_gb_q4": 16,
        "recommended_gpu": "RTX 3060 12GB minimum, 16GB+ preferred",
        "local_fit_note": "Practical 12GB local-agent candidate at Q4 with headroom checks."
      },
      "routing_guidance": {
        "green": "Prefer local inference first if the selected GPU has comfortable headroom.",
        "yellow": "Reduce context, use a lower-bit quantization, try a smaller model, or expect RAM/offload slowdown.",
        "red": "Use a smaller local model, larger GPU, cloud GPU fallback, or compare API/cloud costs.",
        "commercial_option_ids": [],
        "ranking_influenced_by_affiliate": false
      },
      "recommended_applications": [],
      "sources": [
        {
          "label": "Ollama library",
          "url": "https://ollama.com/library/deepseek-r1"
        }
      ],
      "last_verified": "2026-05-19",
      "confidence": "high"
    },
    {
      "id": "deepseek-r1-distill-qwen-32b",
      "model_name": "DeepSeek R1 Distill Qwen 32B",
      "family": "DeepSeek R1 Distill",
      "provider": "DeepSeek",
      "license_type": "open-weight",
      "parameters_billions": 32,
      "architecture": "dense",
      "ollama_tag": "deepseek-r1:32b",
      "recommended_quantization": "Q4_K_M",
      "best_for": [
        "chat",
        "coding",
        "reasoning",
        "Heavy local reasoning on 24GB GPUs"
      ],
      "weak_for": [
        "agents/tool workflows",
        "vision/image understanding",
        "Tight VRAM headroom and slower agent loops"
      ],
      "not_for": [
        "image understanding / multimodal vision",
        "long-running tool agents"
      ],
      "agent_readiness_score": 78,
      "context": {
        "minimum_context_tokens": 8192,
        "typical_context_tokens": 128000,
        "context_fit_note": "Large context-capable on paper; verify runtime memory and backend limits."
      },
      "memory_estimates_gb": {
        "quantization": {
          "q4": 21,
          "q5": 26.25,
          "q8": 42,
          "fp16": 84
        },
        "q4_context_profiles": {
          "quick": 18.9,
          "coding": 21,
          "repo": 24.15,
          "rag": 28.35,
          "agent": 30.45,
          "extreme": 35.7
        },
        "caveat": "Estimates are practical planning estimates. Real use varies by backend, KV cache, context length, drivers, quantization file, and offloading."
      },
      "hardware": {
        "minimum_vram_gb_q4": 24,
        "comfortable_vram_gb_q4": 32,
        "recommended_gpu": "RTX 3090/4090 24GB minimum, 32GB+ preferred",
        "local_fit_note": "Workstation-local candidate. Prefer 32GB+ VRAM for agents or long context."
      },
      "routing_guidance": {
        "green": "Prefer local inference first if the selected GPU has comfortable headroom.",
        "yellow": "Reduce context, use lower quantization, try a smaller model, or expect RAM/offload slowdown.",
        "red": "Use a smaller local model, larger GPU, cloud GPU fallback, or compare API/cloud costs.",
        "commercial_option_ids": [
          "apiroute-cloud-api-comparison"
        ],
        "ranking_influenced_by_affiliate": false
      },
      "recommended_applications": [
        {
          "id": "desktop-multi-agent-command-center",
          "name": "Desktop Multi-Agent Command Center",
          "category": "Agent orchestration"
        }
      ],
      "sources": [
        {
          "label": "Ollama library",
          "url": "https://ollama.com/library/deepseek-r1"
        }
      ],
      "last_verified": "2026-05-19",
      "confidence": "medium"
    },
    {
      "id": "gemma-3-4b",
      "model_name": "Gemma 3 4B",
      "family": "Gemma",
      "provider": "Google",
      "license_type": "open-weight",
      "parameters_billions": 4,
      "architecture": "dense",
      "ollama_tag": "gemma3:4b",
      "recommended_quantization": "Q4_K_M",
      "best_for": [
        "chat",
        "agents/tool workflows",
        "vision/image understanding",
        "Small multimodal local assistant and low-resource setups"
      ],
      "weak_for": [
        "coding",
        "reasoning",
        "Limited quality for coding and complex tasks"
      ],
      "not_for": [
        "coding-heavy workloads",
        "deep multi-step reasoning"
      ],
      "agent_readiness_score": 70,
      "context": {
        "minimum_context_tokens": 8192,
        "typical_context_tokens": 128000,
        "context_fit_note": "Large context-capable on paper; verify runtime memory and backend limits."
      },
      "memory_estimates_gb": {
        "quantization": {
          "q4": 3.5,
          "q5": 4.38,
          "q8": 7,
          "fp16": 14
        },
        "q4_context_profiles": {
          "quick": 3.15,
          "coding": 3.5,
          "repo": 4.02,
          "rag": 4.73,
          "agent": 5.08,
          "extreme": 5.95
        },
        "caveat": "Estimates are practical planning estimates. Real use varies by backend, KV cache, context length, drivers, quantization file, and offloading."
      },
      "hardware": {
        "minimum_vram_gb_q4": 6,
        "comfortable_vram_gb_q4": 8,
        "recommended_gpu": "6GB+ VRAM GPU",
        "local_fit_note": "Good small-local-model candidate for 8GB+ GPUs at Q4."
      },
      "routing_guidance": {
        "green": "Prefer local inference first if the selected GPU has comfortable headroom.",
        "yellow": "Reduce context, use lower quantization, try a smaller model, or expect RAM/offload slowdown.",
        "red": "Use a smaller local model, larger GPU, cloud GPU fallback, or compare API/cloud costs.",
        "commercial_option_ids": [],
        "ranking_influenced_by_affiliate": false
      },
      "recommended_applications": [],
      "sources": [
        {
          "label": "Ollama library",
          "url": "https://ollama.com/library/gemma3"
        }
      ],
      "last_verified": "2026-05-19",
      "confidence": "high"
    },
    {
      "id": "gemma-3-12b",
      "model_name": "Gemma 3 12B",
      "family": "Gemma",
      "provider": "Google",
      "license_type": "open-weight",
      "parameters_billions": 12,
      "architecture": "dense",
      "ollama_tag": "gemma3:12b",
      "recommended_quantization": "Q4_K_M",
      "best_for": [
        "chat",
        "agents/tool workflows",
        "vision/image understanding",
        "reasoning",
        "Balanced multimodal local chat on 12GB+ GPUs"
      ],
      "weak_for": [
        "coding",
        "Not primarily a coding model"
      ],
      "not_for": [
        "coding-heavy workloads"
      ],
      "agent_readiness_score": 78,
      "context": {
        "minimum_context_tokens": 8192,
        "typical_context_tokens": 128000,
        "context_fit_note": "Large context-capable on paper; verify runtime memory and backend limits."
      },
      "memory_estimates_gb": {
        "quantization": {
          "q4": 9,
          "q5": 11.25,
          "q8": 18,
          "fp16": 36
        },
        "q4_context_profiles": {
          "quick": 8.1,
          "coding": 9,
          "repo": 10.35,
          "rag": 12.15,
          "agent": 13.05,
          "extreme": 15.3
        },
        "caveat": "Estimates are practical planning estimates. Real use varies by backend, KV cache, context length, drivers, quantization file, and offloading."
      },
      "hardware": {
        "minimum_vram_gb_q4": 12,
        "comfortable_vram_gb_q4": 16,
        "recommended_gpu": "RTX 3060 12GB or better",
        "local_fit_note": "Practical 12GB local-agent candidate at Q4 with headroom checks."
      },
      "routing_guidance": {
        "green": "Prefer local inference first if the selected GPU has comfortable headroom.",
        "yellow": "Reduce context, use lower quantization, try a smaller model, or expect RAM/offload slowdown.",
        "red": "Use a smaller local model, larger GPU, cloud GPU fallback, or compare API/cloud costs.",
        "commercial_option_ids": [],
        "ranking_influenced_by_affiliate": false
      },
      "recommended_applications": [],
      "sources": [
        {
          "label": "Ollama library",
          "url": "https://ollama.com/library/gemma3"
        }
      ],
      "last_verified": "2026-05-19",
      "confidence": "high"
    },
    {
      "id": "gemma-3-27b",
      "model_name": "Gemma 3 27B",
      "family": "Gemma",
      "provider": "Google",
      "license_type": "open-weight",
      "parameters_billions": 27,
      "architecture": "dense",
      "ollama_tag": "gemma3:27b",
      "recommended_quantization": "Q4_K_M",
      "best_for": [
        "chat",
        "agents/tool workflows",
        "vision/image understanding",
        "reasoning",
        "High-quality multimodal local assistant on 24GB GPUs"
      ],
      "weak_for": [
        "coding",
        "Less specialized for code than Qwen Coder"
      ],
      "not_for": [
        "coding-heavy workloads"
      ],
      "agent_readiness_score": 84,
      "context": {
        "minimum_context_tokens": 8192,
        "typical_context_tokens": 128000,
        "context_fit_note": "Large context-capable on paper; verify runtime memory and backend limits."
      },
      "memory_estimates_gb": {
        "quantization": {
          "q4": 18,
          "q5": 22.5,
          "q8": 36,
          "fp16": 72
        },
        "q4_context_profiles": {
          "quick": 16.2,
          "coding": 18,
          "repo": 20.7,
          "rag": 24.3,
          "agent": 26.1,
          "extreme": 30.6
        },
        "caveat": "Estimates are practical planning estimates. Real use varies by backend, KV cache, context length, drivers, quantization file, and offloading."
      },
      "hardware": {
        "minimum_vram_gb_q4": 24,
        "comfortable_vram_gb_q4": 32,
        "recommended_gpu": "RTX 3090/4090 24GB or better",
        "local_fit_note": "Workstation-local candidate. Prefer 32GB+ VRAM for agents or long context."
      },
      "routing_guidance": {
        "green": "Prefer local inference first if the selected GPU has comfortable headroom.",
        "yellow": "Reduce context, use lower quantization, try a smaller model, or expect RAM/offload slowdown.",
        "red": "Use a smaller local model, larger GPU, cloud GPU fallback, or compare API/cloud costs.",
        "commercial_option_ids": [
          "apiroute-cloud-api-comparison"
        ],
        "ranking_influenced_by_affiliate": false
      },
      "recommended_applications": [],
      "sources": [
        {
          "label": "Ollama library",
          "url": "https://ollama.com/library/gemma3"
        }
      ],
      "last_verified": "2026-05-19",
      "confidence": "high"
    },
    {
      "id": "gemma-4-e4b",
      "model_name": "Gemma 4 E4B",
      "family": "Gemma",
      "provider": "Google",
      "license_type": "open-weight",
      "parameters_billions": 4,
      "architecture": "moe",
      "ollama_tag": "gemma4:e4b",
      "recommended_quantization": "Q4_K_M",
      "best_for": [
        "chat",
        "coding",
        "agents/tool workflows",
        "vision/image understanding",
        "reasoning",
        "Efficient multimodal local assistant and edge-style agent workflows"
      ],
      "weak_for": [
        "Smaller effective model; not ideal for deep repository-scale coding"
      ],
      "not_for": [],
      "agent_readiness_score": 83,
      "context": {
        "minimum_context_tokens": 32768,
        "typical_context_tokens": 128000,
        "context_fit_note": "Large context-capable on paper; verify runtime memory and backend limits."
      },
      "memory_estimates_gb": {
        "quantization": {
          "q4": 9.6,
          "q5": 12,
          "q8": 19.2,
          "fp16": 38.4
        },
        "q4_context_profiles": {
          "quick": 8.64,
          "coding": 9.6,
          "repo": 11.04,
          "rag": 12.96,
          "agent": 13.92,
          "extreme": 16.32
        },
        "caveat": "Estimates are practical planning estimates. Real use varies by backend, KV cache, context length, drivers, quantization file, and offloading."
      },
      "hardware": {
        "minimum_vram_gb_q4": 12,
        "comfortable_vram_gb_q4": 16,
        "recommended_gpu": "RTX 3060 12GB minimum, 16GB+ preferred",
        "local_fit_note": "Practical 12GB local-agent candidate at Q4 with headroom checks."
      },
      "routing_guidance": {
        "green": "Prefer local inference first if the selected GPU has comfortable headroom.",
        "yellow": "Reduce context, use lower quantization, try a smaller model, or expect RAM/offload slowdown.",
        "red": "Use a smaller local model, larger GPU, cloud GPU fallback, or compare API/cloud costs.",
        "commercial_option_ids": [],
        "ranking_influenced_by_affiliate": false
      },
      "recommended_applications": [],
      "sources": [
        {
          "label": "Ollama library",
          "url": "https://ollama.com/library/gemma4"
        },
        {
          "label": "Google DeepMind Gemma 4",
          "url": "https://deepmind.google/models/gemma/gemma-4/"
        }
      ],
      "last_verified": "2026-05-26",
      "confidence": "high"
    },
    {
      "id": "gemma-4-31b",
      "model_name": "Gemma 4 31B",
      "family": "Gemma",
      "provider": "Google",
      "license_type": "open-weight",
      "parameters_billions": 31,
      "architecture": "dense",
      "ollama_tag": "gemma4:31b",
      "recommended_quantization": "Q4_K_M",
      "best_for": [
        "chat",
        "coding",
        "agents/tool workflows",
        "vision/image understanding",
        "reasoning",
        "High-quality multimodal reasoning, coding assistants, and local-first agent workflows"
      ],
      "weak_for": [
        "Single 24GB GPUs have limited headroom for long context"
      ],
      "not_for": [],
      "agent_readiness_score": 91,
      "context": {
        "minimum_context_tokens": 32768,
        "typical_context_tokens": 256000,
        "context_fit_note": "Large context-capable on paper; verify runtime memory and backend limits."
      },
      "memory_estimates_gb": {
        "quantization": {
          "q4": 20,
          "q5": 25,
          "q8": 40,
          "fp16": 80
        },
        "q4_context_profiles": {
          "quick": 18,
          "coding": 20,
          "repo": 23,
          "rag": 27,
          "agent": 29,
          "extreme": 34
        },
        "caveat": "Estimates are practical planning estimates. Real use varies by backend, KV cache, context length, drivers, quantization file, and offloading."
      },
      "hardware": {
        "minimum_vram_gb_q4": 24,
        "comfortable_vram_gb_q4": 32,
        "recommended_gpu": "RTX 3090/4090 24GB minimum, 32GB+ preferred",
        "local_fit_note": "Workstation-local candidate. Prefer 32GB+ VRAM for agents or long context."
      },
      "routing_guidance": {
        "green": "Prefer local inference first if the selected GPU has comfortable headroom.",
        "yellow": "Reduce context, use lower quantization, try a smaller model, or expect RAM/offload slowdown.",
        "red": "Use a smaller local model, larger GPU, cloud GPU fallback, or compare API/cloud costs.",
        "commercial_option_ids": [
          "apiroute-cloud-api-comparison"
        ],
        "ranking_influenced_by_affiliate": false
      },
      "recommended_applications": [],
      "sources": [
        {
          "label": "Ollama library",
          "url": "https://ollama.com/library/gemma4"
        },
        {
          "label": "Google DeepMind Gemma 4",
          "url": "https://deepmind.google/models/gemma/gemma-4/"
        }
      ],
      "last_verified": "2026-05-26",
      "confidence": "high"
    },
    {
      "id": "mistral-7b",
      "model_name": "Mistral 7B",
      "family": "Mistral",
      "provider": "Mistral AI",
      "license_type": "open-weight",
      "parameters_billions": 7,
      "architecture": "dense",
      "ollama_tag": "mistral:7b",
      "recommended_quantization": "Q4_K_M",
      "best_for": [
        "chat",
        "agents/tool workflows",
        "Fast local chat and simple agent tasks"
      ],
      "weak_for": [
        "coding",
        "vision/image understanding",
        "reasoning",
        "Older/smaller than newer Qwen/Gemma alternatives"
      ],
      "not_for": [
        "image understanding / multimodal vision",
        "coding-heavy workloads",
        "deep multi-step reasoning"
      ],
      "agent_readiness_score": 74,
      "context": {
        "minimum_context_tokens": 8192,
        "typical_context_tokens": 32768,
        "context_fit_note": "Good practical context range for coding, chat, and moderate RAG."
      },
      "memory_estimates_gb": {
        "quantization": {
          "q4": 5.5,
          "q5": 6.88,
          "q8": 11,
          "fp16": 22
        },
        "q4_context_profiles": {
          "quick": 4.95,
          "coding": 5.5,
          "repo": 6.32,
          "rag": 7.43,
          "agent": 7.97,
          "extreme": 9.35
        },
        "caveat": "Estimates are practical planning estimates. Real use varies by backend, KV cache, context length, drivers, quantization file, and offloading."
      },
      "hardware": {
        "minimum_vram_gb_q4": 8,
        "comfortable_vram_gb_q4": 12,
        "recommended_gpu": "RTX 4060 8GB or better",
        "local_fit_note": "Good small-local-model candidate for 8GB+ GPUs at Q4."
      },
      "routing_guidance": {
        "green": "Prefer local inference first if the selected GPU has comfortable headroom.",
        "yellow": "Reduce context, use lower quantization, try a smaller model, or expect RAM/offload slowdown.",
        "red": "Use a smaller local model, larger GPU, cloud GPU fallback, or compare API/cloud costs.",
        "commercial_option_ids": [],
        "ranking_influenced_by_affiliate": false
      },
      "recommended_applications": [
        {
          "id": "telegram-ai-bot",
          "name": "Telegram AI Bot",
          "category": "Chat and automation bot"
        }
      ],
      "sources": [
        {
          "label": "Ollama library",
          "url": "https://ollama.com/library/mistral"
        }
      ],
      "last_verified": "2026-05-19",
      "confidence": "high"
    },
    {
      "id": "devstral-small-2-24b",
      "model_name": "Devstral Small 2 24B",
      "family": "Devstral",
      "provider": "Mistral AI",
      "license_type": "open-weight",
      "parameters_billions": 24,
      "architecture": "dense",
      "ollama_tag": "devstral-small-2",
      "recommended_quantization": "Q4_K_M",
      "best_for": [
        "chat",
        "coding",
        "agents/tool workflows",
        "vision/image understanding",
        "reasoning",
        "Software engineering agents, repo navigation, patch planning, and local coding workflows"
      ],
      "weak_for": [
        "Large-context coding work is tight below 24GB VRAM"
      ],
      "not_for": [],
      "agent_readiness_score": 91,
      "context": {
        "minimum_context_tokens": 32768,
        "typical_context_tokens": 384000,
        "context_fit_note": "Large context-capable on paper; verify runtime memory and backend limits."
      },
      "memory_estimates_gb": {
        "quantization": {
          "q4": 15,
          "q5": 18.75,
          "q8": 30,
          "fp16": 60
        },
        "q4_context_profiles": {
          "quick": 13.5,
          "coding": 15,
          "repo": 17.25,
          "rag": 20.25,
          "agent": 21.75,
          "extreme": 25.5
        },
        "caveat": "Estimates are practical planning estimates. Real use varies by backend, KV cache, context length, drivers, quantization file, and offloading."
      },
      "hardware": {
        "minimum_vram_gb_q4": 16,
        "comfortable_vram_gb_q4": 24,
        "recommended_gpu": "16GB VRAM minimum, 24GB+ preferred",
        "local_fit_note": "Workstation-local candidate. Prefer 24GB+ VRAM for agents or long context."
      },
      "routing_guidance": {
        "green": "Prefer local inference first if the selected GPU has comfortable headroom.",
        "yellow": "Reduce context, use lower quantization, try a smaller model, or expect RAM/offload slowdown.",
        "red": "Use a smaller local model, larger GPU, cloud GPU fallback, or compare API/cloud costs.",
        "commercial_option_ids": [
          "apiroute-cloud-api-comparison"
        ],
        "ranking_influenced_by_affiliate": false
      },
      "recommended_applications": [],
      "sources": [
        {
          "label": "Ollama library",
          "url": "https://ollama.com/library/devstral-small-2"
        },
        {
          "label": "Mistral AI model card",
          "url": "https://docs.mistral.ai/models/model-cards/devstral-small-2-25-12"
        },
        {
          "label": "Mistral AI announcement",
          "url": "https://mistral.ai/news/devstral-2-vibe-cli"
        }
      ],
      "last_verified": "2026-05-26",
      "confidence": "high"
    },
    {
      "id": "mixtral-8x7b",
      "model_name": "Mixtral 8x7B",
      "family": "Mixtral",
      "provider": "Mistral AI",
      "license_type": "open-weight",
      "parameters_billions": 46.7,
      "architecture": "moe",
      "ollama_tag": "mixtral:8x7b",
      "recommended_quantization": "Q4_K_M",
      "best_for": [
        "chat",
        "coding",
        "agents/tool workflows",
        "reasoning",
        "MoE local reasoning/chat when enough VRAM is available"
      ],
      "weak_for": [
        "vision/image understanding",
        "Not practical for 24GB single-GPU setups without offload"
      ],
      "not_for": [
        "image understanding / multimodal vision"
      ],
      "agent_readiness_score": 82,
      "context": {
        "minimum_context_tokens": 8192,
        "typical_context_tokens": 32768,
        "context_fit_note": "Good practical context range for coding, chat, and moderate RAG."
      },
      "memory_estimates_gb": {
        "quantization": {
          "q4": 28,
          "q5": 35,
          "q8": 56,
          "fp16": 112
        },
        "q4_context_profiles": {
          "quick": 25.2,
          "coding": 28,
          "repo": 32.2,
          "rag": 37.8,
          "agent": 40.6,
          "extreme": 47.6
        },
        "caveat": "Estimates are practical planning estimates. Real use varies by backend, KV cache, context length, drivers, quantization file, and offloading."
      },
      "hardware": {
        "minimum_vram_gb_q4": 32,
        "comfortable_vram_gb_q4": 48,
        "recommended_gpu": "32GB+ VRAM or CPU/RAM offload",
        "local_fit_note": "Large local model. Prefer 48GB+ VRAM, multi-GPU, cloud GPU, or hosted API fallback."
      },
      "routing_guidance": {
        "green": "Prefer local inference first if the selected GPU has comfortable headroom.",
        "yellow": "Reduce context, use lower quantization, try a smaller model, or expect RAM/offload slowdown.",
        "red": "Use a smaller local model, larger GPU, cloud GPU fallback, or compare API/cloud costs.",
        "commercial_option_ids": [
          "runpod-cloud-gpu-fallback",
          "apiroute-cloud-api-comparison"
        ],
        "ranking_influenced_by_affiliate": false
      },
      "recommended_applications": [],
      "sources": [
        {
          "label": "Ollama library",
          "url": "https://ollama.com/library/mixtral"
        }
      ],
      "last_verified": "2026-05-19",
      "confidence": "high"
    },
    {
      "id": "phi-4-14b",
      "model_name": "Phi-4 14B",
      "family": "Phi",
      "provider": "Microsoft",
      "license_type": "open-weight",
      "parameters_billions": 14,
      "architecture": "dense",
      "ollama_tag": "phi4:14b",
      "recommended_quantization": "Q4_K_M",
      "best_for": [
        "chat",
        "coding",
        "agents/tool workflows",
        "reasoning",
        "Compact reasoning and technical assistant on 12GB/16GB GPUs"
      ],
      "weak_for": [
        "vision/image understanding",
        "Smaller ecosystem than Llama/Qwen families"
      ],
      "not_for": [
        "image understanding / multimodal vision"
      ],
      "agent_readiness_score": 82,
      "context": {
        "minimum_context_tokens": 8192,
        "typical_context_tokens": 16384,
        "context_fit_note": "Short-to-moderate context; avoid large document or whole-repo workflows."
      },
      "memory_estimates_gb": {
        "quantization": {
          "q4": 10.5,
          "q5": 13.13,
          "q8": 21,
          "fp16": 42
        },
        "q4_context_profiles": {
          "quick": 9.45,
          "coding": 10.5,
          "repo": 12.07,
          "rag": 14.18,
          "agent": 15.22,
          "extreme": 17.85
        },
        "caveat": "Estimates are practical planning estimates. Real use varies by backend, KV cache, context length, drivers, quantization file, and offloading."
      },
      "hardware": {
        "minimum_vram_gb_q4": 12,
        "comfortable_vram_gb_q4": 16,
        "recommended_gpu": "RTX 3060 12GB minimum, 16GB+ preferred",
        "local_fit_note": "Practical 12GB local-agent candidate at Q4 with headroom checks."
      },
      "routing_guidance": {
        "green": "Prefer local inference first if the selected GPU has comfortable headroom.",
        "yellow": "Reduce context, use lower quantization, try a smaller model, or expect RAM/offload slowdown.",
        "red": "Use a smaller local model, larger GPU, cloud GPU fallback, or compare API/cloud costs.",
        "commercial_option_ids": [],
        "ranking_influenced_by_affiliate": false
      },
      "recommended_applications": [
        {
          "id": "local-coding-agent",
          "name": "Local Coding Agent",
          "category": "Coding agent"
        }
      ],
      "sources": [
        {
          "label": "Ollama library",
          "url": "https://ollama.com/library/phi4"
        }
      ],
      "last_verified": "2026-05-19",
      "confidence": "high"
    },
    {
      "id": "gpt-oss-20b",
      "model_name": "gpt-oss 20B",
      "family": "gpt-oss",
      "provider": "OpenAI",
      "license_type": "open-weight",
      "parameters_billions": 20,
      "architecture": "moe",
      "ollama_tag": "gpt-oss:20b",
      "recommended_quantization": "MXFP4 native",
      "best_for": [
        "chat",
        "coding",
        "agents/tool workflows",
        "reasoning",
        "Local reasoning, agent planning, and tool-use workflows on 16GB+ GPUs"
      ],
      "weak_for": [
        "vision/image understanding",
        "12GB GPUs need offload or smaller fallback models"
      ],
      "not_for": [
        "image understanding / multimodal vision"
      ],
      "agent_readiness_score": 89,
      "context": {
        "minimum_context_tokens": 32768,
        "typical_context_tokens": 128000,
        "context_fit_note": "Large context-capable on paper; verify runtime memory and backend limits."
      },
      "memory_estimates_gb": {
        "quantization": {
          "q4": 14,
          "q5": 17.5,
          "q8": 28,
          "fp16": 56
        },
        "q4_context_profiles": {
          "quick": 12.6,
          "coding": 14,
          "repo": 16.1,
          "rag": 18.9,
          "agent": 20.3,
          "extreme": 23.8
        },
        "caveat": "Estimates are practical planning estimates. Real use varies by backend, KV cache, context length, drivers, quantization file, and offloading."
      },
      "hardware": {
        "minimum_vram_gb_q4": 16,
        "comfortable_vram_gb_q4": 24,
        "recommended_gpu": "16GB VRAM minimum, 24GB+ preferred",
        "local_fit_note": "Workstation-local candidate. Prefer 24GB+ VRAM for agents or long context."
      },
      "routing_guidance": {
        "green": "Prefer local inference first if the selected GPU has comfortable headroom.",
        "yellow": "Reduce context, use lower quantization, try a smaller model, or expect RAM/offload slowdown.",
        "red": "Use a smaller local model, larger GPU, cloud GPU fallback, or compare API/cloud costs.",
        "commercial_option_ids": [
          "apiroute-cloud-api-comparison"
        ],
        "ranking_influenced_by_affiliate": false
      },
      "recommended_applications": [],
      "sources": [
        {
          "label": "Ollama library",
          "url": "https://ollama.com/library/gpt-oss"
        },
        {
          "label": "OpenAI gpt-oss",
          "url": "https://cdn.openai.com/pdf/419b6906-9da6-406c-a19d-1bb078ac7637/oai_gpt-oss_model_card.pdf"
        }
      ],
      "last_verified": "2026-05-26",
      "confidence": "high"
    },
    {
      "id": "gpt-oss-120b",
      "model_name": "gpt-oss 120B",
      "family": "gpt-oss",
      "provider": "OpenAI",
      "license_type": "open-weight",
      "parameters_billions": 120,
      "architecture": "moe",
      "ollama_tag": "gpt-oss:120b",
      "recommended_quantization": "MXFP4 native",
      "best_for": [
        "chat",
        "coding",
        "agents/tool workflows",
        "reasoning",
        "Large local reasoning servers, heavy agent orchestration, and high-end homelab inference"
      ],
      "weak_for": [
        "vision/image understanding",
        "Not realistic for consumer single-GPU setups below 80GB-class memory"
      ],
      "not_for": [
        "image understanding / multimodal vision"
      ],
      "agent_readiness_score": 93,
      "context": {
        "minimum_context_tokens": 32768,
        "typical_context_tokens": 128000,
        "context_fit_note": "Large context-capable on paper; verify runtime memory and backend limits."
      },
      "memory_estimates_gb": {
        "quantization": {
          "q4": 65,
          "q5": 81.25,
          "q8": 130,
          "fp16": 260
        },
        "q4_context_profiles": {
          "quick": 58.5,
          "coding": 65,
          "repo": 74.75,
          "rag": 87.75,
          "agent": 94.25,
          "extreme": 110.5
        },
        "caveat": "Estimates are practical planning estimates. Real use varies by backend, KV cache, context length, drivers, quantization file, and offloading."
      },
      "hardware": {
        "minimum_vram_gb_q4": 80,
        "comfortable_vram_gb_q4": 96,
        "recommended_gpu": "80GB+ VRAM server GPU or multi-GPU setup",
        "local_fit_note": "Large local model. Prefer 96GB+ VRAM, multi-GPU, cloud GPU, or hosted API fallback."
      },
      "routing_guidance": {
        "green": "Prefer local inference first if the selected GPU has comfortable headroom.",
        "yellow": "Reduce context, use lower quantization, try a smaller model, or expect RAM/offload slowdown.",
        "red": "Use a smaller local model, larger GPU, cloud GPU fallback, or compare API/cloud costs.",
        "commercial_option_ids": [
          "runpod-cloud-gpu-fallback",
          "apiroute-cloud-api-comparison"
        ],
        "ranking_influenced_by_affiliate": false
      },
      "recommended_applications": [],
      "sources": [
        {
          "label": "Ollama library",
          "url": "https://ollama.com/library/gpt-oss"
        },
        {
          "label": "OpenAI gpt-oss",
          "url": "https://cdn.openai.com/pdf/419b6906-9da6-406c-a19d-1bb078ac7637/oai_gpt-oss_model_card.pdf"
        }
      ],
      "last_verified": "2026-05-26",
      "confidence": "high"
    }
  ]
}
