MLX-LM Server provides an OpenAI-compatible HTTP server that runs large language models entirely on Apple silicon, enabling fully local agentic AI workflows with tool calling, continuous batching, and distributed inference across multiple Macs.
• No cloud dependency — your data and code never leave your Mac, making it viable for sensitive codebases and offline environments
• Drop-in replacement for any OpenAI-compatible agent framework (Xcode Intelligence, OpenCode, custom scripts) just by pointing the base URL to localhost
• Neural Accelerators on M5 deliver 4× faster prompt processing, making multi-step agentic loops with large context windows practical in real workflows
A minimal SwiftUI app that sends a chat message to a locally running MLX-LM Server and streams the response, demonstrating how any Swift app can talk to an on-device model via the OpenAI-compatible API.
import SwiftUI
// MARK: - OpenAI-compatible request/response models
/// One message in an OpenAI-style chat conversation.
struct ChatMessage: Codable {
    /// Who authored the message (the view model in this file sends `"user"`).
    let role: String

    /// The text of the message.
    let content: String
}
/// JSON body posted to the chat-completions endpoint.
struct ChatRequest: Codable {
    /// Identifier of the model the server should run.
    let model: String

    /// Conversation to complete, in order.
    let messages: [ChatMessage]

    /// Whether the reply should be streamed back in chunks.
    let stream: Bool
}
/// One entry of the `choices` array in a streamed chunk.
struct ChatChoice: Codable {
    /// The incremental update carried by this choice.
    let delta: Delta

    /// Incremental payload of a streamed choice.
    struct Delta: Codable {
        /// Newly generated text; `nil` when the `content` key is absent or null.
        let content: String?
    }
}
/// A single decoded `data:` payload from the streaming response.
struct ChatStreamChunk: Codable {
    /// Choices carried by this chunk; the view model reads only the first.
    let choices: [ChatChoice]
}
// MARK: - View Model
/// Sends a single-turn chat prompt to a local OpenAI-compatible server and
/// publishes the streamed reply for SwiftUI to render.
@MainActor
final class LocalLLMViewModel: ObservableObject {
    /// Text streamed back so far, or an `"Error: …"` message if the request failed.
    @Published var response: String = ""

    /// `true` from the start of `send(prompt:)` until it returns.
    @Published var isLoading = false

    // MLX-LM Server default endpoint
    private let serverURL = URL(string: "http://localhost:8080/v1/chat/completions")!

    /// Posts `prompt` as a single user message and appends each streamed token
    /// to `response` as it arrives.
    ///
    /// Failures never throw to the caller: encoding errors, transport errors and
    /// non-2xx HTTP statuses are all reported by writing an `"Error: …"` string
    /// into `response`.
    ///
    /// - Parameter prompt: The user text to send to the model.
    func send(prompt: String) async {
        isLoading = true
        response = ""
        // Guarantees the flag is cleared on every exit path, including early returns.
        defer { isLoading = false }

        let body = ChatRequest(
            model: "mlx-community/Qwen2.5-7B-Instruct-4bit",
            messages: [ChatMessage(role: "user", content: prompt)],
            stream: true
        )

        do {
            var request = URLRequest(url: serverURL)
            request.httpMethod = "POST"
            request.setValue("application/json", forHTTPHeaderField: "Content-Type")
            // Encode inside the `do` so a failure is surfaced instead of
            // silently sending a request with no body.
            request.httpBody = try JSONEncoder().encode(body)

            let (bytes, urlResponse) = try await URLSession.shared.bytes(for: request)

            // An error reply is not an event stream; without this check the loop
            // below would skip every line and leave `response` empty.
            if let http = urlResponse as? HTTPURLResponse,
               !(200..<300).contains(http.statusCode) {
                response = "Error: server returned HTTP \(http.statusCode)"
                return
            }

            // One decoder for the whole stream rather than one per line.
            let decoder = JSONDecoder()

            for try await line in bytes.lines {
                // Server-sent events: only `data:` fields carry payloads, and the
                // space after the colon is optional.
                guard line.hasPrefix("data:") else { continue }
                let payload = line.dropFirst(5).trimmingCharacters(in: .whitespaces)

                // OpenAI-style streams end with a literal `[DONE]` sentinel.
                if payload == "[DONE]" { break }

                // Chunks that don't decode or carry no text are skipped.
                guard let chunk = try? decoder.decode(ChatStreamChunk.self, from: Data(payload.utf8)),
                      let token = chunk.choices.first?.delta.content
                else { continue }

                response += token
            }
        } catch {
            response = "Error: \(error.localizedDescription)"
        }
    }
}
// MARK: - SwiftUI View
/// Single-screen UI: a prompt editor, a Send button, and a scrolling area that
/// shows the model's reply as it streams in.
struct LocalLLMView: View {
    @StateObject private var vm = LocalLLMViewModel()
    @State private var prompt = "Summarize what MLX is in one sentence."

    var body: some View {
        VStack(alignment: .leading, spacing: 16) {
            Text("Local LLM (MLX-LM Server)")
                .font(.headline)

            TextEditor(text: $prompt)
                .frame(height: 80)
                .overlay(border)

            Button("Send", action: submit)
                .disabled(vm.isLoading)
                .buttonStyle(.borderedProminent)

            ScrollView {
                Text(displayedText)
                    .frame(maxWidth: .infinity, alignment: .leading)
                    .padding(8)
            }
            .frame(maxHeight: .infinity)
            .overlay(border)
        }
        .padding()
    }

    /// Rounded outline shared by the editor and the response area.
    private var border: some View {
        RoundedRectangle(cornerRadius: 8).stroke(.secondary)
    }

    /// The reply if there is one, otherwise a placeholder reflecting load state.
    private var displayedText: String {
        guard vm.response.isEmpty else { return vm.response }
        if vm.isLoading {
            return "Thinking…"
        }
        return "Response appears here"
    }

    /// Starts an unstructured task that sends the current prompt.
    private func submit() {
        Task { await vm.send(prompt: prompt) }
    }
}
#Preview {
LocalLLMView()
.frame(width: 480, height: 400)
}
MLX and MLX-LM are open-source Python packages installed via pip, not native Apple frameworks, so Swift/SwiftUI apps consume the server over standard HTTP (the OpenAI chat completions API) rather than importing a framework directly. The server must be running before your app or agent attempts to connect. The model must support tool calling for agentic use. Large models such as DeepSeek 671B require significant unified memory.
Requires Apple silicon Mac; Neural Accelerator 4× speedup is M5-specific; distributed inference over Thunderbolt RDMA requires macOS 26.2+ and multiple Apple silicon Macs connected via Thunderbolt or Ethernet
More iOS 27 APIs land every week.
Get notified when new capabilities are published — no noise, just signal.