Advertisement
maifeeulasad

gpu-data.json

Mar 19th, 2025
134
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
JSON 16.56 KB | None | 0 0
  1. [
  2.     {
  3.         "GPU": "BC-250",
  4.         "Tok/s": "26.89-33.52 tokens/s",
  5.         "TFLOPS": "",
  6.         "Format": "GGUF",
  7.         "Cost": "$20",
  8.         "Loading Secs": "21.49secs",
  9.         "2nd Load": "",
  10.         "Context (max)s": "",
  11.         "Context sent": "109 tokens",
  12.         "VRAM": "",
  13.         "TDP": "",
  14.         "watts inference": "197W",
  15.         "Watts idle(Loaded)": "85W* - 101W",
  16.         "Watts idle (0B VRAM)": "85W* - 101W",
  17.         "Notes": "* 101W stock on P4.00G Bios. 85W with oberon-governor Single node on APW3+ and 12V Delta blower fan."
  18.     },
  19.     {
  20.         "GPU": "P102-100",
  21.         "Tok/s": "22.62 tokens/s",
  22.         "TFLOPS": "10.77 fp32",
  23.         "Format": "GGUF",
  24.         "Cost": "$40",
  25.         "Loading Secs": "11.4secs",
  26.         "2nd Load": "",
  27.         "Context (max)s": "8192",
  28.         "Context sent": "109 tokens",
  29.         "VRAM": "9320MB",
  30.         "TDP": "250W",
  31.         "watts inference": "140-220W",
  32.         "Watts idle(Loaded)": "9W",
  33.         "Watts idle (0B VRAM)": "9W",
  34.         "Notes": ""
  35.     },
  36.     {
  37.         "GPU": "P104-100 Q6_K_L",
  38.         "Tok/s": "16.92 tokens/s",
  39.         "TFLOPS": "6.655 fp32",
  40.         "Format": "GGUF",
  41.         "Cost": "$30",
  42.         "Loading Secs": "26.33secs",
  43.         "2nd Load": "16.24secs",
  44.         "Context (max)s": "8192",
  45.         "Context sent": "109 tokens",
  46.         "VRAM": "7362MB",
  47.         "TDP": "180W",
  48.         "watts inference": "85-155W",
  49.         "Watts idle(Loaded)": "5W",
  50.         "Watts idle (0B VRAM)": "5W",
  51.         "Notes": ""
  52.     },
  53.     {
  54.         "GPU": "M40",
  55.         "Tok/s": "15.67 tokens/s",
  56.         "TFLOPS": "6.832 fp32",
  57.         "Format": "GGUF",
  58.         "Cost": "$40",
  59.         "Loading Secs": "23.44secs",
  60.         "2nd Load": "2.4secs",
  61.         "Context (max)s": "8192",
  62.         "Context sent": "109 tokens",
  63.         "VRAM": "9292MB",
  64.         "TDP": "250W",
  65.         "watts inference": "125-220W",
  66.         "Watts idle(Loaded)": "62W",
  67.         "Watts idle (0B VRAM)": "15W",
  68.         "Notes": "CUDA error: CUDA-capable device(s) is/are busy or unavailable"
  69.     },
  70.     {
  71.         "GPU": "GTX 1060 Q4_K_M",
  72.         "Tok/s": "15.17 tokens/s",
  73.         "TFLOPS": "4.375 fp32",
  74.         "Format": "GGUF",
  75.         "Cost": "",
  76.         "Loading Secs": "",
  77.         "2nd Load": "2.02secs",
  78.         "Context (max)s": "4096",
  79.         "Context sent": "109 tokens",
  80.         "VRAM": "5278MB",
  81.         "TDP": "120W",
  82.         "watts inference": "65-120W",
  83.         "Watts idle(Loaded)": "5W",
  84.         "Watts idle (0B VRAM)": "5W",
  85.         "Notes": ""
  86.     },
  87.     {
  88.         "GPU": "GTX 1070 ti Q6_K_L",
  89.         "Tok/s": "17.28 tokens/s",
  90.         "TFLOPS": "8.186 fp32",
  91.         "Format": "GGUF",
  92.         "Cost": "$100",
  93.         "Loading Secs": "19.70secs",
  94.         "2nd Load": "3.55secs",
  95.         "Context (max)s": "8192",
  96.         "Context sent": "109 tokens",
  97.         "VRAM": "7684MB***",
  98.         "TDP": "180W",
  99.         "watts inference": "90-170W",
  100.         "Watts idle(Loaded)": "6W",
  101.         "Watts idle (0B VRAM)": "6W",
  102.         "Notes": "Meta-Llama-3.1-8B-Instruct-Q6_K_L.gguf"
  103.     },
  104.     {
  105.         "GPU": "AMD Radeon Instinct MI25",
  106.         "Tok/s": "soon..",
  107.         "TFLOPS": "",
  108.         "Format": "",
  109.         "Cost": "",
  110.         "Loading Secs": "",
  111.         "2nd Load": "",
  112.         "Context (max)s": "",
  113.         "Context sent": "",
  114.         "VRAM": "",
  115.         "TDP": "",
  116.         "watts inference": "",
  117.         "Watts idle(Loaded)": "",
  118.         "Watts idle (0B VRAM)": "",
  119.         "Notes": ""
  120.     },
  121.     {
  122.         "GPU": "P4",
  123.         "Tok/s": "",
  124.         "TFLOPS": "5.704 fp32",
  125.         "Format": "GGUF",
  126.         "Cost": "$100",
  127.         "Loading Secs": "",
  128.         "2nd Load": "",
  129.         "Context (max)s": "8192",
  130.         "Context sent": "109 tokens",
  131.         "VRAM": "",
  132.         "TDP": "75W",
  133.         "watts inference": "",
  134.         "Watts idle(Loaded)": "",
  135.         "Watts idle (0B VRAM)": "",
  136.         "Notes": ""
  137.     },
  138.     {
  139.         "GPU": "P40",
  140.         "Tok/s": "18.56 tokens/s",
  141.         "TFLOPS": "11.76 fp32",
  142.         "Format": "GGUF",
  143.         "Cost": "$300",
  144.         "Loading Secs": "",
  145.         "2nd Load": "3.58secs**",
  146.         "Context (max)s": "8192",
  147.         "Context sent": "109 tokens",
  148.         "VRAM": "9341MB",
  149.         "TDP": "250W",
  150.         "watts inference": "90-150W",
  151.         "Watts idle(Loaded)": "50W",
  152.         "Watts idle (0B VRAM)": "10W",
  153.         "Notes": "same inference time with or without flash_attention. **NVME on another server"
  154.     },
  155.     {
  156.         "GPU": "P100",
  157.         "Tok/s": "21.48 tokens/s",
  158.         "TFLOPS": "9.526 fp32",
  159.         "Format": "GGUF",
  160.         "Cost": "$150",
  161.         "Loading Secs": "23.51secs",
  162.         "2nd Load": "",
  163.         "Context (max)s": "8192",
  164.         "Context sent": "109 tokens",
  165.         "VRAM": "9448MB",
  166.         "TDP": "250W",
  167.         "watts inference": "80-140W",
  168.         "Watts idle(Loaded)": "33W",
  169.         "Watts idle (0B VRAM)": "26W",
  170.         "Notes": ""
  171.     },
  172.     {
  173.         "GPU": "P100",
  174.         "Tok/s": "29.58 tokens/s",
  175.         "TFLOPS": "19.05 fp16",
  176.         "Format": "EXL2",
  177.         "Cost": "$150",
  178.         "Loading Secs": "22.51secs",
  179.         "2nd Load": "6.95secs",
  180.         "Context (max)s": "8192",
  181.         "Context sent": "109 tokens",
  182.         "VRAM": "9458MB",
  183.         "TDP": "250W",
  184.         "watts inference": "95-150W",
  185.         "Watts idle(Loaded)": "33W",
  186.         "Watts idle (0B VRAM)": "26W",
  187.         "Notes": "no_flash_attn=true"
  188.     },
  189.     {
  190.         "GPU": "CMP 70HX Q6_K_L",
  191.         "Tok/s": "12.8 tokens/s",
  192.         "TFLOPS": "10.71 fp32",
  193.         "Format": "GGUF",
  194.         "Cost": "$150",
  195.         "Loading Secs": "26.7secs",
  196.         "2nd Load": "9secs",
  197.         "Context (max)s": "8192",
  198.         "Context sent": "109 tokens",
  199.         "VRAM": "7693MB",
  200.         "TDP": "220W",
  201.         "watts inference": "80-100W",
  202.         "Watts idle(Loaded)": "65W** 13W setting p-state 8",
  203.         "Watts idle (0B VRAM)": "65W",
  204.         "Notes": "Meta-Llama-3.1-8B-Instruct-Q6_K_L.gguf RISER"
  205.     },
  206.     {
  207.         "GPU": "CMP 70HX Q6_K_L",
  208.         "Tok/s": "16.47 tokens/s",
  209.         "TFLOPS": "10.71 fp32",
  210.         "Format": "GGUF/FA",
  211.         "Cost": "$150",
  212.         "Loading Secs": "26.78secs",
  213.         "2nd Load": "9secs",
  214.         "Context (max)s": "8192",
  215.         "Context sent": "109 tokens",
  216.         "VRAM": "7391MB",
  217.         "TDP": "220W",
  218.         "watts inference": "80-110W",
  219.         "Watts idle(Loaded)": "65W",
  220.         "Watts idle (0B VRAM)": "65W",
  221.         "Notes": "flash_attention RISER"
  222.     },
  223.     {
  224.         "GPU": "CMP 70HX 6bpw",
  225.         "Tok/s": "25.12 tokens/s",
  226.         "TFLOPS": "10.71 fp16",
  227.         "Format": "EXL2",
  228.         "Cost": "$150",
  229.         "Loading Secs": "22.07secs",
  230.         "2nd Load": "8.81secs",
  231.         "Context (max)s": "8192",
  232.         "Context sent": "109 tokens",
  233.         "VRAM": "7653MB",
  234.         "TDP": "220W",
  235.         "watts inference": "70-110W",
  236.         "Watts idle(Loaded)": "65W",
  237.         "Watts idle (0B VRAM)": "65W",
  238.         "Notes": "turboderp/Llama-3.1-8B-Instruct-exl2 at 6.0bpw no_flash_attn RISER"
  239.     },
  240.     {
  241.         "GPU": "CMP 70HX 6bpw",
  242.         "Tok/s": "30.08 tokens/s",
  243.         "TFLOPS": "10.71 fp16",
  244.         "Format": "EXL2/FA",
  245.         "Cost": "$150",
  246.         "Loading Secs": "22.22secs",
  247.         "2nd Load": "13.14secs",
  248.         "Context (max)s": "8192",
  249.         "Context sent": "109 tokens",
  250.         "VRAM": "7653MB",
  251.         "TDP": "220W",
  252.         "watts inference": "110W",
  253.         "Watts idle(Loaded)": "65W",
  254.         "Watts idle (0B VRAM)": "65W",
  255.         "Notes": "turboderp/Llama-3.1-8B-Instruct-exl2:6.0bpw RISER"
  256.     },
  257.     {
  258.         "GPU": "GTX 1080ti",
  259.         "Tok/s": "22.80 tokens/s",
  260.         "TFLOPS": "11.34 fp32",
  261.         "Format": "GGUF",
  262.         "Cost": "$160",
  263.         "Loading Secs": "23.99secs",
  264.         "2nd Load": "2.89secs",
  265.         "Context (max)s": "8192",
  266.         "Context sent": "109 tokens",
  267.         "VRAM": "9332MB",
  268.         "TDP": "250W",
  269.         "watts inference": "120-200W",
  270.         "Watts idle(Loaded)": "8W",
  271.         "Watts idle (0B VRAM)": "8W",
  272.         "Notes": "RISER"
  273.     },
  274.     {
  275.         "GPU": "CMP 100-210",
  276.         "Tok/s": "25.07 tokens/s",
  277.         "TFLOPS": "11.75 fp32",
  278.         "Format": "GGUF",
  279.         "Cost": "$150",
  280.         "Loading Secs": "39.98secs",
  281.         "2nd Load": "",
  282.         "Context (max)s": "8192",
  283.         "Context sent": "109 tokens",
  284.         "VRAM": "9461MB",
  285.         "TDP": "250W",
  286.         "watts inference": "80-130W",
  287.         "Watts idle(Loaded)": "28W",
  288.         "Watts idle (0B VRAM)": "24W",
  289.         "Notes": "rope_freq_base=0, or coredump"
  290.     },
  291.     {
  292.         "GPU": "CMP 100-210",
  293.         "Tok/s": "40.66 tokens/s",
  294.         "TFLOPS": "23.49 fp16",
  295.         "Format": "EXL2",
  296.         "Cost": "$150",
  297.         "Loading Secs": "41.43secs",
  298.         "2nd Load": "",
  299.         "Context (max)s": "8192",
  300.         "Context sent": "109 tokens",
  301.         "VRAM": "9489MB",
  302.         "TDP": "250W",
  303.         "watts inference": "120-170W",
  304.         "Watts idle(Loaded)": "28W",
  305.         "Watts idle (0B VRAM)": "24W",
  306.         "Notes": "no_flash_attn=true"
  307.     },
  308.     {
  309.         "GPU": "RTX 3070 Q6_K_L",
  310.         "Tok/s": "27.96 tokens/s",
  311.         "TFLOPS": "20.31 fp32",
  312.         "Format": "GGUF",
  313.         "Cost": "$250",
  314.         "Loading Secs": "",
  315.         "2nd Load": "5.15secs",
  316.         "Context (max)s": "8192",
  317.         "Context sent": "109 tokens",
  318.         "VRAM": "7765MB",
  319.         "TDP": "240W",
  320.         "watts inference": "145-165W",
  321.         "Watts idle(Loaded)": "23W",
  322.         "Watts idle (0B VRAM)": "15W",
  323.         "Notes": ""
  324.     },
  325.     {
  326.         "GPU": "RTX 3070 Q6_K_L",
  327.         "Tok/s": "29.63 tokens/s",
  328.         "TFLOPS": "20.31 fp32",
  329.         "Format": "GGUF/FA",
  330.         "Cost": "$250",
  331.         "Loading Secs": "22.4secs",
  332.         "2nd Load": "5.3secs",
  333.         "Context (max)s": "8192",
  334.         "Context sent": "109 tokens",
  335.         "VRAM": "7435MB",
  336.         "TDP": "240W",
  337.         "watts inference": "165-185W",
  338.         "Watts idle(Loaded)": "23W",
  339.         "Watts idle (0B VRAM)": "15W",
  340.         "Notes": ""
  341.     },
  342.     {
  343.         "GPU": "RTX 3070 6bpw",
  344.         "Tok/s": "31.36 tokens/s",
  345.         "TFLOPS": "20.31 fp16",
  346.         "Format": "EXL2",
  347.         "Cost": "$250",
  348.         "Loading Secs": "",
  349.         "2nd Load": "5.17secs",
  350.         "Context (max)s": "8192",
  351.         "Context sent": "109 tokens",
  352.         "VRAM": "7707MiB",
  353.         "TDP": "240W",
  354.         "watts inference": "140-155W",
  355.         "Watts idle(Loaded)": "23W",
  356.         "Watts idle (0B VRAM)": "15W",
  357.         "Notes": ""
  358.     },
  359.     {
  360.         "GPU": "RTX 3070 6bpw",
  361.         "Tok/s": "35.27 tokens/s",
  362.         "TFLOPS": "20.31 fp16",
  363.         "Format": "EXL2/FA",
  364.         "Cost": "$250",
  365.         "Loading Secs": "17.48secs",
  366.         "2nd Load": "5.39secs",
  367.         "Context (max)s": "8192",
  368.         "Context sent": "109 tokens",
  369.         "VRAM": "7707MiB",
  370.         "TDP": "240W",
  371.         "watts inference": "130-145W",
  372.         "Watts idle(Loaded)": "23W",
  373.         "Watts idle (0B VRAM)": "15W",
  374.         "Notes": ""
  375.     },
  376.     {
  377.         "GPU": "Titan V",
  378.         "Tok/s": "37.37 tokens/s",
  379.         "TFLOPS": "14.90 fp32",
  380.         "Format": "GGUF",
  381.         "Cost": "$300",
  382.         "Loading Secs": "23.38secs",
  383.         "2nd Load": "2.53secs",
  384.         "Context (max)s": "8192",
  385.         "Context sent": "109 tokens",
  386.         "VRAM": "9502MB",
  387.         "TDP": "250W",
  388.         "watts inference": "90W-127W",
  389.         "Watts idle(Loaded)": "25W",
  390.         "Watts idle (0B VRAM)": "25W",
  391.         "Notes": "--tensorcores"
  392.     },
  393.     {
  394.         "GPU": "Titan V",
  395.         "Tok/s": "45.65 tokens/s",
  396.         "TFLOPS": "29.80 fp16",
  397.         "Format": "EXL2",
  398.         "Cost": "$300",
  399.         "Loading Secs": "20.75secs",
  400.         "2nd Load": "6.27secs",
  401.         "Context (max)s": "8192",
  402.         "Context sent": "109 tokens",
  403.         "VRAM": "9422MB",
  404.         "TDP": "250W",
  405.         "watts inference": "110-130W",
  406.         "Watts idle(Loaded)": "25W",
  407.         "Watts idle (0B VRAM)": "23W",
  408.         "Notes": "no_flash_attn=true"
  409.     },
  410.     {
  411.         "GPU": "Tesla T4",
  412.         "Tok/s": "19.57 tokens/s",
  413.         "TFLOPS": "8.141 fp32",
  414.         "Format": "GGUF",
  415.         "Cost": "$500",
  416.         "Loading Secs": "23.72secs",
  417.         "2nd Load": "2.24secs",
  418.         "Context (max)s": "8192",
  419.         "Context sent": "109 tokens",
  420.         "VRAM": "9294MB",
  421.         "TDP": "70W",
  422.         "watts inference": "45-50W",
  423.         "Watts idle(Loaded)": "37W",
  424.         "Watts idle (0B VRAM)": "10-27W",
  425.         "Notes": "Card I had bounced between P0 & P8 idle"
  426.     },
  427.     {
  428.         "GPU": "Tesla T4",
  429.         "Tok/s": "23.99 tokens/s",
  430.         "TFLOPS": "65.13 fp16",
  431.         "Format": "EXL2",
  432.         "Cost": "$500",
  433.         "Loading Secs": "27.04secs",
  434.         "2nd Load": "6.63secs",
  435.         "Context (max)s": "8192",
  436.         "Context sent": "109 tokens",
  437.         "VRAM": "9220MB",
  438.         "TDP": "70W",
  439.         "watts inference": "60-70W",
  440.         "Watts idle(Loaded)": "27W",
  441.         "Watts idle (0B VRAM)": "10-27W",
  442.         "Notes": ""
  443.     },
  444.     {
  445.         "GPU": "Titan RTX",
  446.         "Tok/s": "31.62 tokens/s",
  447.         "TFLOPS": "16.31 fp32",
  448.         "Format": "GGUF",
  449.         "Cost": "$700",
  450.         "Loading Secs": "",
  451.         "2nd Load": "2.93secs",
  452.         "Context (max)s": "8192",
  453.         "Context sent": "109 tokens",
  454.         "VRAM": "9358MB",
  455.         "TDP": "280W",
  456.         "watts inference": "180-210W",
  457.         "Watts idle(Loaded)": "15W",
  458.         "Watts idle (0B VRAM)": "15W",
  459.         "Notes": "--tensorcores"
  460.     },
  461.     {
  462.         "GPU": "Titan RTX",
  463.         "Tok/s": "32.56 tokens/s",
  464.         "TFLOPS": "16.31 fp32",
  465.         "Format": "GGUF/FA",
  466.         "Cost": "$700",
  467.         "Loading Secs": "23.78secs",
  468.         "2nd Load": "2.92secs",
  469.         "Context (max)s": "8192",
  470.         "Context sent": "109 tokens",
  471.         "VRAM": "9056MB",
  472.         "TDP": "280W",
  473.         "watts inference": "185-215W",
  474.         "Watts idle(Loaded)": "15W",
  475.         "Watts idle (0B VRAM)": "15W",
  476.         "Notes": "--tensorcores flash_attn=true"
  477.     },
  478.     {
  479.         "GPU": "Titan RTX",
  480.         "Tok/s": "44.15 tokens/s",
  481.         "TFLOPS": "32.62 fp16",
  482.         "Format": "EXL2",
  483.         "Cost": "$700",
  484.         "Loading Secs": "26.58secs",
  485.         "2nd Load": "6.47secs",
  486.         "Context (max)s": "8192",
  487.         "Context sent": "109 tokens",
  488.         "VRAM": "9246MB",
  489.         "TDP": "280W",
  490.         "watts inference": "220-240W",
  491.         "Watts idle(Loaded)": "15W",
  492.         "Watts idle (0B VRAM)": "15W",
  493.         "Notes": "no_flash_attn=true"
  494.     },
  495.     {
  496.         "GPU": "RTX 3090",
  497.         "Tok/s": "35.13 tokens/s",
  498.         "TFLOPS": "35.58 fp32",
  499.         "Format": "GGUF",
  500.         "Cost": "$700",
  501.         "Loading Secs": "24.00secs",
  502.         "2nd Load": "2.89secs",
  503.         "Context (max)s": "8192",
  504.         "Context sent": "109 tokens",
  505.         "VRAM": "9456MB",
  506.         "TDP": "350W",
  507.         "watts inference": "235-260W",
  508.         "Watts idle(Loaded)": "17W",
  509.         "Watts idle (0B VRAM)": "6W",
  510.         "Notes": ""
  511.     },
  512.     {
  513.         "GPU": "RTX 3090",
  514.         "Tok/s": "36.02 tokens/s",
  515.         "TFLOPS": "35.58 fp32",
  516.         "Format": "GGUF/FA",
  517.         "Cost": "$700",
  518.         "Loading Secs": "",
  519.         "2nd Load": "2.82secs",
  520.         "Context (max)s": "8192",
  521.         "Context sent": "109 tokens",
  522.         "VRAM": "9154MB",
  523.         "TDP": "350W",
  524.         "watts inference": "260-265W",
  525.         "Watts idle(Loaded)": "17W",
  526.         "Watts idle (0B VRAM)": "6W",
  527.         "Notes": ""
  528.     },
  529.     {
  530.         "GPU": "RTX 3090",
  531.         "Tok/s": "49.11 tokens/s",
  532.         "TFLOPS": "35.58 fp16",
  533.         "Format": "EXL2",
  534.         "Cost": "$700",
  535.         "Loading Secs": "26.14secs",
  536.         "2nd Load": "7.63secs",
  537.         "Context (max)s": "8192",
  538.         "Context sent": "109 tokens",
  539.         "VRAM": "9360MB",
  540.         "TDP": "350W",
  541.         "watts inference": "270-315W",
  542.         "Watts idle(Loaded)": "17W",
  543.         "Watts idle (0B VRAM)": "6W",
  544.         "Notes": ""
  545.     },
  546.     {
  547.         "GPU": "RTX 3090",
  548.         "Tok/s": "54.75 tokens/s",
  549.         "TFLOPS": "35.58 fp16",
  550.         "Format": "EXL2/FA",
  551.         "Cost": "$700",
  552.         "Loading Secs": "",
  553.         "2nd Load": "7.37secs",
  554.         "Context (max)s": "8192",
  555.         "Context sent": "109 tokens",
  556.         "VRAM": "9360MB",
  557.         "TDP": "350W",
  558.         "watts inference": "285-310W",
  559.         "Watts idle(Loaded)": "17W",
  560.         "Watts idle (0B VRAM)": "6W",
  561.         "Notes": ""
  562.     }
  563. ]
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement