refactor: fix several memory leaks and improve UX by exposing microphone selection, typing-mode selection, and configurable notifications

This commit is contained in:
syntaxbullet
2026-02-17 15:57:14 +01:00
parent 7cbf2d04a9
commit 435c87803b
6 changed files with 261 additions and 75 deletions

View File

@@ -11,6 +11,8 @@ os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
# Run offline — models are downloaded during setup, no need to hit HuggingFace on every launch.
os.environ.setdefault("HF_HUB_OFFLINE", "1")
import subprocess
import rumps
from calliope import config as config_mod
@@ -36,6 +38,7 @@ class CalliopeApp(rumps.App):
self.recorder = Recorder(device=cfg.get("device"))
self.transcriber = Transcriber(
model=cfg.get("model", "distil-whisper/distil-large-v3"),
silence_threshold=cfg.get("silence_threshold", 0.005),
)
self.transcriber.context = cfg.get("context", "")
self.transcriber.language = cfg.get("language", "auto")
@@ -44,11 +47,15 @@ class CalliopeApp(rumps.App):
self._rec_lock = threading.Lock()
self._rec_start_time: float | None = None
self._rec_timer: rumps.Timer | None = None
self._transcribe_done = threading.Event()
self._transcribe_done.set() # not transcribing initially
self.status_item = rumps.MenuItem("Status: Loading model...")
self.status_item.set_callback(None)
self.toggle_item = rumps.MenuItem("Start Recording", callback=self._on_toggle_click)
self.context_item = rumps.MenuItem("Set Whisper Context...", callback=self._on_set_context)
ctx = cfg.get("context", "")
context_label = f"Set Whisper Context... ({ctx[:20]}...)" if ctx else "Set Whisper Context..."
self.context_item = rumps.MenuItem(context_label, callback=self._on_set_context)
# Language submenu
self._lang_menu = rumps.MenuItem("Language")
@@ -67,6 +74,19 @@ class CalliopeApp(rumps.App):
item = rumps.MenuItem(f"{prefix}{short}", callback=self._on_model_select)
self._model_menu.add(item)
# Microphone submenu
self._mic_menu = rumps.MenuItem("Microphone")
self._build_mic_menu()
# Typing mode submenu
self._typing_menu = rumps.MenuItem("Typing Mode")
current_mode = cfg.get("typing_mode", "char")
for mode, label in [("char", "Character (CGEvents)"), ("clipboard", "Clipboard (Cmd+V)")]:
prefix = "\u2713 " if mode == current_mode else " "
item = rumps.MenuItem(f"{prefix}{label}", callback=self._on_typing_mode_select)
item._typing_mode = mode
self._typing_menu.add(item)
quit_item = rumps.MenuItem("Quit Calliope", callback=self._on_quit)
self.menu = [
@@ -76,6 +96,8 @@ class CalliopeApp(rumps.App):
self.context_item,
self._lang_menu,
self._model_menu,
self._mic_menu,
self._typing_menu,
None,
quit_item,
]
@@ -92,19 +114,34 @@ class CalliopeApp(rumps.App):
# Load model in background
threading.Thread(target=self._load_model, daemon=True).start()
def _notify(self, title: str, subtitle: str, message: str) -> None:
if not self.cfg.get("notifications", True):
return
try:
text = f"{subtitle}{message}" if subtitle else message
script = f'display notification "{text}" with title "{title}"'
subprocess.Popen(
["osascript", "-e", script],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
except Exception:
pass
def _ready_status(self) -> str:
short = self.transcriber.model.split("/")[-1]
return f"Status: Ready ({short})"
def _load_model(self) -> None:
try:
self.transcriber.load()
self.status_item.title = "Status: Ready"
self.status_item.title = self._ready_status()
self.hotkeys.start()
log.info("Model loaded, hotkeys active")
except Exception:
log.error("Failed to load model", exc_info=True)
self.status_item.title = "Status: Model load failed"
try:
rumps.notification("Calliope", "Error", "Failed to load Whisper model. Check logs.")
except RuntimeError:
pass
self._notify("Calliope", "Error", "Failed to load Whisper model. Check logs.")
@staticmethod
def _activate_app():
@@ -125,20 +162,23 @@ class CalliopeApp(rumps.App):
response = rumps.Window(
message="Provide context to help Whisper with domain-specific terms, "
"names, or jargon. For example:\n\n"
"\"Meeting about Kubernetes, gRPC, and the Istio service mesh.\"",
"\"Meeting about Kubernetes, gRPC, and the Istio service mesh.\"\n\n"
"Clear the field and press Save to remove context.",
title="Set Whisper Context",
default_text=self.transcriber.context,
ok="Save",
cancel="Clear",
cancel="Cancel",
dimensions=(320, 120),
).run()
if response.clicked == 1: # Save
self.transcriber.context = response.text.strip()
else: # Clear
self.transcriber.context = ""
if response.clicked != 1: # Cancel / Escape
self._deactivate_app()
return
self.transcriber.context = response.text.strip()
self._deactivate_app()
ctx = self.transcriber.context
self.context_item.title = f"Set Whisper Context... ({ctx[:20]}...)" if ctx else "Set Whisper Context..."
self.cfg["context"] = ctx
config_mod.save(self.cfg)
def _on_language_select(self, sender) -> None:
display_name = sender.title.strip().lstrip("\u2713").strip()
@@ -170,19 +210,68 @@ class CalliopeApp(rumps.App):
config_mod.save(self.cfg)
self.status_item.title = "Status: Loading model..."
self.hotkeys.stop()
self._release_transcriber()
self.transcriber = Transcriber(model=model_id)
self.transcriber.context = self.cfg.get("context", "")
self.transcriber.language = self.cfg.get("language", "auto")
threading.Thread(target=self._load_model, daemon=True).start()
def _switch():
self._transcribe_done.wait() # wait for in-flight transcription
self._release_transcriber()
self.transcriber = Transcriber(
model=model_id,
silence_threshold=self.cfg.get("silence_threshold", 0.005),
)
self.transcriber.context = self.cfg.get("context", "")
self.transcriber.language = self.cfg.get("language", "auto")
self._load_model()
threading.Thread(target=_switch, daemon=True).start()
log.info("Switching model to %s", model_id)
def _build_mic_menu(self) -> None:
    """Fill the Microphone submenu with "System Default" plus every input device."""
    import sounddevice as sd

    selected = self.cfg.get("device")
    checked, unchecked = "\u2713 ", " "

    # First entry: the system default, ticked when no device is configured.
    default_mark = checked if selected is None else unchecked
    self._mic_menu.add(
        rumps.MenuItem(f"{default_mark}System Default", callback=self._on_mic_select)
    )

    # One entry per device that reports at least one input channel.
    for info in sd.query_devices():
        if not info["max_input_channels"] > 0:
            continue
        device_idx = info["index"]
        mark = checked if selected == device_idx else unchecked
        entry = rumps.MenuItem(f"{mark}{info['name']}", callback=self._on_mic_select)
        # Remember which device this entry stands for; the select callback
        # reads it back from the clicked item.
        entry._device_index = device_idx
        self._mic_menu.add(entry)
def _on_mic_select(self, sender) -> None:
    """Switch the recorder to the clicked microphone and persist the choice.

    Args:
        sender: The clicked menu item. Its ``_device_index`` attribute
            names the device; items without it (the "System Default"
            entry) select ``None``.
    """
    name = sender.title.strip().lstrip("\u2713").strip()
    device_index = getattr(sender, "_device_index", None)
    self.recorder._device = device_index
    self.cfg["device"] = device_index
    config_mod.save(self.cfg)

    # Move the checkmark. Items are matched on their device index rather
    # than their display name, so two devices that share a name are not
    # both shown as selected.
    for item in self._mic_menu.values():
        item_name = item.title.strip().lstrip("\u2713").strip()
        is_selected = getattr(item, "_device_index", None) == device_index
        item.title = f"\u2713 {item_name}" if is_selected else f" {item_name}"
    log.info("Microphone set to %s (device=%s)", name, device_index)
def _on_typing_mode_select(self, sender) -> None:
    """Persist the clicked typing mode and move the checkmark onto it."""
    chosen = sender._typing_mode
    self.cfg["typing_mode"] = chosen
    config_mod.save(self.cfg)

    # Re-title every entry: tick the one whose mode matches, untick the rest.
    for entry in self._typing_menu.values():
        bare_label = entry.title.strip().lstrip("\u2713").strip()
        if getattr(entry, "_typing_mode", None) == chosen:
            entry.title = f"\u2713 {bare_label}"
        else:
            entry.title = f" {bare_label}"
    log.info("Typing mode set to %s", chosen)
def _release_transcriber(self) -> None:
"""Free the current Whisper model to reclaim GPU memory."""
import gc
import torch
if self.transcriber is not None:
self.transcriber._pipe = None
self.transcriber._tokenizer = None
import torch
gc.collect()
if torch.backends.mps.is_available():
torch.mps.empty_cache()
@@ -214,18 +303,12 @@ class CalliopeApp(rumps.App):
self.title = "\U0001f3a4" # 🎤
self.toggle_item.title = "Start Recording"
self.status_item.title = "Status: Mic error (check device)"
try:
rumps.notification("Calliope", "", "Microphone unavailable — check audio device")
except RuntimeError:
pass
self._notify("Calliope", "", "Microphone unavailable — check audio device")
return
self.overlay.show()
self._rec_timer = rumps.Timer(self._update_rec_duration, 1)
self._rec_timer.start()
try:
rumps.notification("Calliope", "", "Recording started")
except RuntimeError:
pass # Info.plist missing CFBundleIdentifier
self._notify("Calliope", "", "Recording started")
log.info("Recording started")
def _stop_and_transcribe(self) -> None:
@@ -238,36 +321,45 @@ class CalliopeApp(rumps.App):
self._rec_timer = None
duration = int(time.time() - self._rec_start_time) if self._rec_start_time else 0
self._rec_start_time = None
self.title = "\U0001f3a4" # 🎤
self.title = "\u23f3" #
self.toggle_item.title = "Start Recording"
self.status_item.title = "Status: Transcribing..."
self.overlay.show_transcribing()
audio = self.recorder.stop()
try:
rumps.notification("Calliope", "", f"Recording stopped ({duration}s)")
except RuntimeError:
pass
self._notify("Calliope", "", f"Recording stopped ({duration}s)")
log.info("Recording stopped, %d samples", audio.size)
self._transcribe_done.clear()
threading.Thread(target=self._transcribe_and_type, args=(audio,), daemon=True).start()
def _update_rec_duration(self, timer) -> None:
if self._rec_start_time is None:
return
elapsed = int(time.time() - self._rec_start_time)
max_dur = self.cfg.get("max_recording_seconds", 300)
if max_dur and elapsed >= max_dur:
log.info("Max recording duration reached (%ds)", max_dur)
self._stop_and_transcribe()
return
minutes, seconds = divmod(elapsed, 60)
self.title = f"\U0001f534 {minutes}:{seconds:02d}"
def _transcribe_and_type(self, audio) -> None:
try:
text = self.transcriber.transcribe(audio)
if not text:
self.overlay.hide()
self.title = "\U0001f3a4" # 🎤
self.status_item.title = self._ready_status()
self._notify("Calliope", "", "No speech detected — audio too short or too quiet")
return
if text:
def _do_type():
try:
if self.cfg.get("typing_mode", "char") == "clipboard":
type_text_clipboard(text)
else:
type_text(text)
type_text(text, delay=self.cfg.get("typing_delay", 0.005))
print(f"\n[Calliope] {text}")
log.info("Typed %d chars", len(text))
except Exception:
@@ -275,15 +367,15 @@ class CalliopeApp(rumps.App):
from PyObjCTools.AppHelper import callAfter
callAfter(_do_type)
self.overlay.hide()
self.status_item.title = "Status: Ready"
self.status_item.title = self._ready_status()
except Exception:
log.error("Transcription failed", exc_info=True)
self.overlay.hide()
self.status_item.title = "Status: Ready"
try:
rumps.notification("Calliope", "Error", "Transcription failed. Check logs.")
except RuntimeError:
pass
self.status_item.title = self._ready_status()
self._notify("Calliope", "Error", "Transcription failed. Check logs.")
finally:
self.title = "\U0001f3a4" # 🎤
self._transcribe_done.set()
def _on_quit(self, sender) -> None:
self.hotkeys.stop()

View File

@@ -22,6 +22,10 @@ DEFAULTS: dict[str, Any] = {
"context": "",
"debug": False,
"typing_mode": "char", # "char" or "clipboard"
"max_recording_seconds": 300, # 5 minutes
"silence_threshold": 0.005, # RMS energy below which audio is considered silence
"notifications": True, # show macOS notifications
"typing_delay": 0.005, # seconds between keystrokes in char mode
}
LANGUAGES: dict[str, str] = {

View File

@@ -14,21 +14,53 @@ _KEY_MAP: dict[str, keyboard.Key] = {
"alt": keyboard.Key.alt,
"cmd": keyboard.Key.cmd,
"space": keyboard.Key.space,
"tab": keyboard.Key.tab,
"esc": keyboard.Key.esc,
"enter": keyboard.Key.enter,
"backspace": keyboard.Key.backspace,
"delete": keyboard.Key.delete,
}
# Add function keys F1-F12
for _i in range(1, 13):
_KEY_MAP[f"f{_i}"] = getattr(keyboard.Key, f"f{_i}")
# Virtual keycodes for left/right modifier normalization (macOS).
# Maps the virtual keycode of each left- and right-hand modifier key to the
# single side-agnostic pynput key, so either physical key matches the same
# entry in a parsed hotkey combo.
_VK_NORMALIZE = {
0x3B: keyboard.Key.ctrl, # left ctrl
0x3E: keyboard.Key.ctrl, # right ctrl
0x38: keyboard.Key.shift, # left shift
0x3C: keyboard.Key.shift, # right shift
0x3A: keyboard.Key.alt, # left alt/option
0x3D: keyboard.Key.alt, # right alt/option
0x37: keyboard.Key.cmd, # left cmd
0x36: keyboard.Key.cmd, # right cmd
}
def _parse_combo(combo: str) -> set[keyboard.Key]:
"""Parse 'ctrl+shift' into a set of pynput keys."""
keys: set[keyboard.Key] = set()
def _parse_combo(combo: str) -> set:
"""Parse 'ctrl+shift' or 'ctrl+r' into a set of pynput keys."""
keys: set = set()
for part in combo.lower().split("+"):
part = part.strip()
if part in _KEY_MAP:
keys.add(_KEY_MAP[part])
elif len(part) == 1:
keys.add(keyboard.KeyCode.from_char(part))
else:
log.warning("Unknown key in combo: %s", part)
return keys
def _check_accessibility() -> bool:
"""Check if Accessibility permission is currently granted."""
try:
from ApplicationServices import AXIsProcessTrusted
return AXIsProcessTrusted()
except Exception:
return True # assume granted if we can't check
class HotkeyListener:
def __init__(
self,
@@ -51,6 +83,8 @@ class HotkeyListener:
log.debug("PTT keys: %s, Toggle keys: %s", self._ptt_keys, self._toggle_keys)
def start(self) -> None:
if not _check_accessibility():
log.error("Accessibility permission not granted — hotkeys will not work")
self._pressed.clear()
self._ptt_active = False
self._toggle_active = False
@@ -75,23 +109,29 @@ class HotkeyListener:
def _normalize(self, key) -> keyboard.Key | keyboard.KeyCode:
if hasattr(key, "value") and hasattr(key.value, "vk"):
vk = key.value.vk
if vk in (0x3B, 0x3E):
return keyboard.Key.ctrl
if vk in (0x38, 0x3C):
return keyboard.Key.shift
normalized = _VK_NORMALIZE.get(vk)
if normalized is not None:
return normalized
# Normalize character keys to lowercase
if isinstance(key, keyboard.KeyCode) and key.char is not None:
return keyboard.KeyCode.from_char(key.char.lower())
return key
def _on_press(self, key) -> None:
key = self._normalize(key)
self._pressed.add(key)
# Check PTT first; if PTT fires, skip toggle to prevent double-trigger
if self._ptt_keys.issubset(self._pressed) and not self._ptt_active:
self._ptt_active = True
self._on_ptt_start()
return
if self._toggle_keys.issubset(self._pressed) and not self._toggle_active:
self._toggle_active = True
self._on_toggle()
# Don't fire toggle if PTT is active
if not self._ptt_active:
self._toggle_active = True
self._on_toggle()
def _on_release(self, key) -> None:
key = self._normalize(key)

View File

@@ -109,18 +109,28 @@ class WaveformView(NSView):
if not amps:
return
step = draw_w / max(len(amps) - 1, 1)
# Draw centered: newest sample at center, older samples outward, mirrored
half_bars = len(amps)
mid_x = w / 2
step = (draw_w / 2) / max(half_bars - 1, 1)
for sign in (1, -1):
line = NSBezierPath.bezierPath()
line.setLineWidth_(1.5)
# Left half: oldest at left edge, newest at center
for i, a in enumerate(amps):
x = padding + i * step
x = mid_x - (half_bars - 1 - i) * step
y_off = a * draw_h * sign
if i == 0:
line.moveToPoint_((x, mid_y + y_off))
else:
line.lineToPoint_((x, mid_y + y_off))
# Right half: mirror (newest at center, oldest at right edge)
for i in range(1, half_bars):
a = amps[half_bars - 1 - i]
x = mid_x + i * step
y_off = a * draw_h * sign
line.lineToPoint_((x, mid_y + y_off))
line.stroke()
self._draw_label("calliope recording...")
@@ -252,8 +262,17 @@ class WaveformOverlay:
"""Switch overlay to transcribing state (pulsing dots)."""
callAfter(self._show_transcribing_on_main)
def _reposition_panel(self):
    """Move the panel to the top-center of the current main screen.

    The panel is centered horizontally and placed 40 points below the top
    edge of the screen frame. If there is no main screen the panel is left
    where it is.
    """
    screen = NSScreen.mainScreen()
    if screen is None:
        # No screen to position against (e.g. no display attached).
        return
    screen_frame = screen.frame()
    # The frame is expressed in global screen coordinates, so its origin
    # is non-zero for any display other than the primary one. Offsetting by
    # it keeps the panel on the main screen instead of the primary display.
    x = screen_frame.origin.x + (screen_frame.size.width - WIDTH) / 2
    y = screen_frame.origin.y + screen_frame.size.height - HEIGHT - 40
    self._panel.setFrameOrigin_(NSMakePoint(x, y))
def _show_on_main(self):
self._ensure_panel()
self._reposition_panel()
self._view.stopFade()
self._view.mode = OverlayMode.RECORDING
self._view.amplitudes = deque([0.0] * NUM_BARS, maxlen=NUM_BARS)
@@ -265,6 +284,7 @@ class WaveformOverlay:
def _show_transcribing_on_main(self):
self._ensure_panel()
self._reposition_panel()
self._view.stopFade()
self._view.mode = OverlayMode.TRANSCRIBING
self._view._pulse_start = time.monotonic()

View File

@@ -10,12 +10,23 @@ log = logging.getLogger(__name__)
class Transcriber:
def __init__(self, model: str = "distil-whisper/distil-large-v3"):
def __init__(self, model: str = "distil-whisper/distil-large-v3", silence_threshold: float = 0.005):
self.model = model
self._pipe = None
self._tokenizer = None
self.context: str = ""
self._context: str = ""
self._cached_prompt_ids = None
self.language: str = "auto"
self.silence_threshold = silence_threshold
@property
def context(self) -> str:
    """The context text currently set on this transcriber."""
    return self._context

@context.setter
def context(self, value: str) -> None:
    """Store new context text and discard the prompt ids cached for the old one."""
    # Any cached prompt ids belong to the previous text; reset them so
    # they are rebuilt the next time they are needed.
    self._cached_prompt_ids = None
    self._context = value
def load(self) -> None:
from transformers import AutoTokenizer
@@ -32,7 +43,12 @@ class Transcriber:
device=device,
)
self._tokenizer = AutoTokenizer.from_pretrained(self.model)
log.info("Model loaded successfully")
log.info("Model loaded, running warmup...")
self._pipe(
{"raw": np.zeros(16_000, dtype=np.float32), "sampling_rate": 16_000},
batch_size=1,
)
log.info("Model ready")
except Exception:
log.error("Failed to load model %s", self.model, exc_info=True)
raise
@@ -48,18 +64,18 @@ class Transcriber:
duration = audio.size / 16_000
energy = float(np.sqrt(np.mean(audio ** 2)))
log.debug("Audio: %.1fs, RMS energy: %.6f", duration, energy)
if duration < 1.0 or energy < 0.005:
if duration < 1.0 or energy < self.silence_threshold:
log.debug("Audio too short or too quiet, skipping transcription")
return ""
generate_kwargs = {}
if self.context:
prompt_ids = self._tokenizer.get_prompt_ids(self.context)
generate_kwargs["prompt_ids"] = prompt_ids
if self._context:
if self._cached_prompt_ids is None:
self._cached_prompt_ids = self._tokenizer.get_prompt_ids(self._context)
generate_kwargs["prompt_ids"] = self._cached_prompt_ids
pipe_kwargs = {
"batch_size": 4,
"return_timestamps": True,
"batch_size": 1,
"generate_kwargs": generate_kwargs,
}
if self.language != "auto":

View File

@@ -1,7 +1,6 @@
"""Type text into the focused field using Quartz CGEvents."""
import logging
import subprocess
import time
import Quartz
@@ -9,36 +8,51 @@ import Quartz
log = logging.getLogger(__name__)
def type_text(text: str) -> None:
def type_text(text: str, delay: float = 0.005) -> None:
"""Simulate typing text into the currently focused text field."""
for char in text:
_type_char(char)
time.sleep(0.005)
time.sleep(delay)
def type_text_clipboard(text: str) -> None:
"""Type text by copying to clipboard and pasting with Cmd+V.
Saves and restores the previous clipboard contents.
Saves and restores the previous clipboard contents, including non-text
data like images and files.
"""
# Save current clipboard
try:
prev = subprocess.run(
["pbpaste"], capture_output=True, text=True, timeout=2,
).stdout
except Exception:
prev = None
from AppKit import NSPasteboard, NSStringPboardType
# Copy text to clipboard
subprocess.run(["pbcopy"], input=text, text=True, timeout=2)
pb = NSPasteboard.generalPasteboard()
# Paste with Cmd+V
# Save all current pasteboard items
saved_items = []
for item in pb.pasteboardItems() or []:
item_data = {}
for ptype in item.types():
data = item.dataForType_(ptype)
if data is not None:
item_data[ptype] = data
if item_data:
saved_items.append(item_data)
# Set our text and paste
pb.clearContents()
pb.setString_forType_(text, NSStringPboardType)
_cmd_v()
time.sleep(0.05)
# Restore previous clipboard
if prev is not None:
subprocess.run(["pbcopy"], input=prev, text=True, timeout=2)
# Restore previous clipboard contents
if saved_items:
from AppKit import NSPasteboardItem
pb.clearContents()
new_items = []
for item_data in saved_items:
item = NSPasteboardItem.alloc().init()
for ptype, data in item_data.items():
item.setData_forType_(data, ptype)
new_items.append(item)
pb.writeObjects_(new_items)
def _cmd_v() -> None: