Current Working Setup Summary
✅ What's Working:
- Podman rootless with slirp4netns networking (localhost access works)
- llama.cpp compiled with ROCm 7.2.0 support
- GPU acceleration working on Ryzen 9900X iGPU (GPU 1, 15GB shared VRAM)
- Ollama CPU working on port 8080 (NextCoder-7B-q4_0, 15GB model)
- Models available: llama-2-7b-chat.Q3_K_M.gguf (3.1GB), starcoder2-3b.Q4_K_M.gguf (139MB)
Recommended Plan: Build Neovim Code Completion Plugin
Phase 1: Container Management Foundation
Create ~/.config/fish/functions/llm.fish:
# LLM Container Management Functions
# Connection and model defaults shared by the llm:* helper functions below.
# -gx makes each variable global and exported to child processes.
set -gx LLM_HOST "localhost"
set -gx LLM_PORT "8080"
# Prompt context window (tokens) and number of layers offloaded to the GPU.
set -gx LLM_CTX_SIZE 4096
set -gx LLM_GPU_LAYERS 30
# Absolute path to the GGUF model stored in the rootless-podman volume.
set -gx LLM_MODEL "/home/jm/.local/share/containers/storage/volumes/llama-models/_data/llama-2-7b-chat.Q3_K_M.gguf"
function llm:start
    # Launch llama-server in the background on the iGPU (device 1), using the
    # LLM_* variables defined above.
    echo "Starting llama.cpp server..."
    # Guard the directory change so a missing build tree aborts instead of
    # launching from the wrong place, and restore the caller's working
    # directory afterwards (fish functions run in the interactive shell, so a
    # bare `cd` would permanently move the user).
    pushd "$HOME/data/ai/ollama/llama.cpp"; or return 1
    ROCR_VISIBLE_DEVICES=1 HIP_VISIBLE_DEVICES=1 HSA_OVERRIDE_GFX_VERSION=10.3.2 \
    ./build/bin/llama-server \
    --model "$LLM_MODEL" \
    --host 0.0.0.0 \
    --port $LLM_PORT \
    --n-gpu-layers $LLM_GPU_LAYERS \
    --ctx-size $LLM_CTX_SIZE &
    popd
end
function llm:stop
    # Terminate any running llama-server processes by matching the full
    # command line (-f); safe to call when no server is running.
    echo "Stopping llama.cpp server..."
    pkill -f llama-server
end
function llm:restart
    # Stop, pause briefly so the port is released, then start again.
    llm:stop
    sleep 2
    llm:start
end
function llm:status
    # Query the server's /health endpoint; prints the JSON response, or
    # nothing (-s suppresses errors) when the server is down.
    curl -s http://$LLM_HOST:$LLM_PORT/health
end
function llm:complete --argument prompt
    # Send a one-shot completion request and print the generated text.
    # Build the JSON payload with jq so quotes, backslashes, or newlines in
    # $prompt cannot break it (the original interpolated the prompt raw into
    # a JSON string literal). The llama.cpp /completion endpoint uses the
    # model the server was started with, so no "model" field is needed.
    jq -n --arg p "$prompt" '{prompt: $p, stream: false}' | \
    curl -s -X POST http://$LLM_HOST:$LLM_PORT/completion \
    -H "Content-Type: application/json" \
    -d @- | \
    jq -r '.content'
end
Phase 2: Neovim Plugin Architecture
Proposed Structure:
~/.config/nvim/lua/llm/
├── init.lua # Plugin initialization
├── config.lua # Configuration
├── completion.lua # Request/response handling
└── health.lua # Server health checks
Key Design Decisions:
Backend Choice: llama.cpp HTTP API (Already working, simpler than RPC)
Why not TabbyML:
- TabbyML is more complex to deploy
- llama.cpp HTTP API is already working with your hardware
- Simple HTTP interface = less moving parts
Why not llama.vim's RPC:
- RPC requires maintaining persistent server process
- HTTP completion can be fire-and-forget (stateless)
Phase 3: Implementation Tasks
3.1 Create Container Management Script
File: ~/scripts/llm-control.sh
#!/usr/bin/env bash
# LLM Control Script: (re)start the llama.cpp server and wait for readiness.
set -u

LLM_MODEL="/home/jm/.local/share/containers/storage/volumes/llama-models/_data/llama-2-7b-chat.Q3_K_M.gguf"
LLM_PORT="8080"
LLM_DIR="$HOME/data/ai/ollama/llama.cpp"
LLM_LOG="$HOME/.local/share/llm-server.log"

# Stop any existing instances; -f matches the pattern (an ERE, so the
# alternation works) against the full command line. Pause briefly after a
# successful kill so the port is released before rebinding.
pkill -f "llama-server|llama-cli" && sleep 1

# Start new instance; fail fast if the build tree is missing instead of
# launching from the wrong directory.
cd "$LLM_DIR" || { echo "❌ LLM directory not found: $LLM_DIR"; exit 1; }
mkdir -p "$(dirname "$LLM_LOG")"
ROCR_VISIBLE_DEVICES=1 HIP_VISIBLE_DEVICES=1 HSA_OVERRIDE_GFX_VERSION=10.3.2 \
./build/bin/llama-server \
--model "$LLM_MODEL" \
--host 0.0.0.0 \
--port "$LLM_PORT" \
--n-gpu-layers 30 \
--ctx-size 4096 \
> "$LLM_LOG" 2>&1 &
LLM_PID=$!
echo "LLM Server PID: $LLM_PID"

# Poll the health endpoint for up to 30 seconds.
for i in {1..30}; do
if curl -s "http://localhost:$LLM_PORT/health" > /dev/null 2>&1; then
echo "✅ LLM Server ready on port $LLM_PORT"
exit 0
fi
sleep 1
done
echo "❌ LLM Server failed to start"
exit 1
3.2 Neovim Plugin Core
File: ~/.config/nvim/lua/llm/init.lua
local M = {}

-- Default configuration; values may be overridden by passing a table to
-- M.setup(). NOTE: the original table was missing the comma after the
-- `model` entry, which is a Lua syntax error.
M.config = {
  host = "localhost",        -- llama.cpp server host
  port = 8080,               -- llama.cpp server port
  timeout = 10000,           -- request timeout in milliseconds
  retry = 3,                 -- reserved: retry attempts for failed requests
  model = "llama-2-7b-chat", -- model name included in request payloads
  temperature = 0.7,         -- sampling temperature
  top_p = 0.9,               -- nucleus sampling cutoff
  max_tokens = 256,          -- upper bound on generated tokens per request
}
function M.setup(opts)
  -- Initialize the plugin. `opts` (optional, backward compatible) is merged
  -- over M.config defaults.
  if opts then
    M.config = vim.tbl_deep_extend("force", M.config, opts)
  end

  -- The original called vim.api.nvim_set_autocmd, which does not exist.
  -- The correct API is nvim_create_autocmd(event, opts), one call per
  -- registration, with a Lua callback rather than a function-name string.
  local group = vim.api.nvim_create_augroup("LLMCompletion", { clear = true })
  vim.api.nvim_create_autocmd("InsertEnter", {
    group = group,
    callback = M.on_insert_enter,
  })
  vim.api.nvim_create_autocmd("TextChanged", {
    group = group,
    callback = M.on_text_change,
  })
  vim.api.nvim_create_autocmd("CursorHold", {
    group = group,
    callback = M.on_cursor_hold,
  })
  vim.api.nvim_create_autocmd("InsertLeave", {
    group = group,
    callback = M.on_insert_leave,
  })
end
function M.check_server(callback)
  -- Asynchronously probe the server's /health endpoint.
  -- The original was missing its closing `end` (a parse error), called an
  -- undefined global `curl`, misused vim.loop.new_async, used the
  -- nonexistent vim.notify.warn, and tried to `return` a value from inside
  -- an async callback, which no caller can observe. We instead shell out to
  -- curl via vim.system (Neovim 0.10+) and report the result through an
  -- optional `callback(ok)` — backward compatible when called with no args.
  local url = string.format("http://%s:%d/health", M.config.host, M.config.port)
  vim.system(
    { "curl", "-sf", "--max-time", tostring(M.config.timeout / 1000), url },
    { text = true },
    vim.schedule_wrap(function(result)
      local ok = result.code == 0
      if not ok then
        vim.notify("LLM server not ready", vim.log.levels.WARN)
      end
      if callback then
        callback(ok)
      end
    end)
  )
end
function M.complete(prefix, callback)
  -- Request a completion for the text before the cursor.
  -- `callback` receives { items = {...}, isIncomplete = false } on success,
  -- or nil on failure.
  -- Fixes over the original: missing closing `end` (parse error);
  -- nvim_buf_get_lines called with 3 args (it requires 4, including
  -- strict_indexing); `lines[row]` indexed a slice with an absolute row
  -- number; undefined global `curl`; nonexistent vim.notify.error; and the
  -- payload sent both max_tokens and n_predict (llama.cpp uses n_predict).
  local buf = vim.api.nvim_get_current_buf()
  local cursor = vim.api.nvim_win_get_cursor(0)
  local row, col = cursor[1], cursor[2] -- row is 1-based, col is 0-based

  -- Take up to the last 100 lines ending at the cursor line as context.
  local first = math.max(0, row - 100)
  local lines = vim.api.nvim_buf_get_lines(buf, first, row, false)
  local current_line = lines[#lines] or ""
  local before_cursor = current_line:sub(1, col)
  -- Replace the final line with only the text before the cursor, then cap
  -- the prompt at the last 2000 characters.
  lines[#lines] = before_cursor
  local prompt = table.concat(lines, "\n"):sub(-2000)

  local body = vim.json.encode({
    model = M.config.model,
    prompt = prompt,
    temperature = M.config.temperature,
    top_p = M.config.top_p,
    n_predict = M.config.max_tokens,
    stream = false,
  })
  local url = string.format("http://%s:%d/completion", M.config.host, M.config.port)

  vim.system(
    { "curl", "-s", "-X", "POST", url,
      "-H", "Content-Type: application/json", "-d", body },
    { text = true },
    vim.schedule_wrap(function(result)
      if result.code ~= 0 or not result.stdout or result.stdout == "" then
        vim.notify("LLM request failed", vim.log.levels.ERROR)
        callback(nil)
        return
      end
      local ok, decoded = pcall(vim.json.decode, result.stdout)
      if ok and type(decoded) == "table"
          and decoded.content and #decoded.content > 0 then
        callback({
          items = vim.split(decoded.content, "\n"),
          isIncomplete = false,
        })
      else
        callback({ items = {} })
      end
    end)
  )
end
function M.on_insert_enter()
  -- Schedule a completion attempt shortly after entering insert mode.
  -- vim.schedule takes no delay argument; vim.defer_fn(fn, ms) is the
  -- correct API for a delayed call.
  vim.defer_fn(M.debounce_complete, 150)
end
function M.on_text_change()
  -- Debounce completion on text changes with a longer delay than
  -- InsertEnter. vim.schedule takes no delay argument; use vim.defer_fn.
  vim.defer_fn(M.debounce_complete, 300)
end
function M.on_cursor_hold()
  -- Don't complete while navigating. Guard the timer: M.debounce_timer is
  -- never initialized at module load, so the original's unconditional
  -- :stop() would error on nil.
  if M.debounce_timer then
    M.debounce_timer:stop()
  end
end
function M.on_insert_leave()
  -- Cancel any pending completion when leaving insert mode. Guarded
  -- because M.debounce_timer may be nil (original crashed on :stop()).
  if M.debounce_timer then
    M.debounce_timer:stop()
  end
end
function M.debounce_complete()
  -- Fire a completion request unless one is already pending.
  -- The original condition was inverted (`not is_active` returned early, so
  -- completion would never run once a timer existed); the correct guard is
  -- to skip while a debounce timer IS still active.
  if M.debounce_timer and M.debounce_timer:is_active() then
    return
  end
  M.last_complete_time = vim.loop.hrtime()
  M.complete(vim.api.nvim_get_current_line(), M.get_callback())
end
function M.get_callback()
  -- Return a callback that turns completion results into the native
  -- popup menu.
  -- Fixes over the original: vim.lsp.complete does not exist and
  -- vim.fn.complete takes (startcol, matches), not four arguments;
  -- require('llm.completion.fuzzy') referenced a module absent from the
  -- proposed plugin layout; abbr = item:sub(1, 3) truncated labels to
  -- three characters.
  return function(completions)
    if not completions or not completions.items or #completions.items == 0 then
      return
    end

    local col = vim.api.nvim_win_get_cursor(0)[2]
    local line = vim.api.nvim_get_current_line()
    local before_cursor = line:sub(1, col)

    -- Keep only suggestions that strictly extend what is already typed.
    local filtered = {}
    for _, item in ipairs(completions.items) do
      if vim.startswith(item, before_cursor) and item ~= before_cursor then
        table.insert(filtered, item)
      end
    end
    if #filtered == 0 then
      return
    end

    -- Build native completion-menu entries (see :help complete-items).
    local comp_items = {}
    for _, item in ipairs(filtered) do
      table.insert(comp_items, {
        word = item,
        abbr = item:sub(1, 30),
        kind = "LLM",
      })
    end

    -- complete() columns are 1-based; start where the matched prefix begins
    -- so the inserted text replaces it cleanly.
    local start_col = col - #before_cursor + 1
    vim.schedule(function()
      -- Only pop the menu while still in insert mode.
      if vim.fn.mode():find("i") then
        vim.fn.complete(start_col, comp_items)
      end
    end)
  end
end

return M
Phase 4: Research Resources
TabbyML Reference:
- Repo: https://github.com/TabbyML/tabby
- Focus: IDE integration, not simple HTTP wrapper
- Backend: gRPC (more complex than needed)
llama.vim Reference:
- Repo: https://github.com/ggml-org/llama.vim
- Uses named pipes/RPC for server communication
- More complexity than HTTP API approach
Simplified Alternative:
Use llama.cpp HTTP API directly:
- No persistent server process needed
- Stateless (fire-and-forget)
- Rootless-friendly
- Works with your current GPU setup
Phase 5: Next Steps
Immediate Actions:
- Create the control script: ~/scripts/llm-control.sh
- Create the plugin directory: ~/.config/nvim/lua/llm/
- Implement the core files: init.lua (main plugin logic), config.lua (configuration), completion.lua (request/response handling), health.lua (server health checks)
Optional Advanced Features:
- Multi-line context awareness
- Streaming completion (with animation)
- Model switching
- File context understanding
Resources to Study:
- Neovim LSP client protocol
- llama.cpp HTTP API documentation
- Async HTTP handling in Lua
- Neovim autocmd and completion API
Would you like me to start implementing the control script and neovim plugin now?
Remember you have a team of agents, so have them work for you on long-running tasks in parallel. You have planners and researchers. You can do anything.