refactor: fix several memory leaks and improve UX by exposing microphone selection etc.

2026-02-17 15:57:14 +01:00
parent 7cbf2d04a9
commit 435c87803b
6 changed files with 261 additions and 75 deletions
--- a/calliope/app.py
+++ b/calliope/app.py
@@ -11,6 +11,8 @@ os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
 # Run offline — models are downloaded during setup, no need to hit HuggingFace on every launch.
 os.environ.setdefault("HF_HUB_OFFLINE", "1")
 import subprocess
 import rumps
 from calliope import config as config_mod
@@ -36,6 +38,7 @@ class CalliopeApp(rumps.App):
        self.recorder = Recorder(device=cfg.get("device"))
        self.transcriber = Transcriber(
            model=cfg.get("model", "distil-whisper/distil-large-v3"),
            silence_threshold=cfg.get("silence_threshold", 0.005),
        )
        self.transcriber.context = cfg.get("context", "")
        self.transcriber.language = cfg.get("language", "auto")
@@ -44,11 +47,15 @@ class CalliopeApp(rumps.App):
        self._rec_lock = threading.Lock()
        self._rec_start_time: float | None = None
        self._rec_timer: rumps.Timer | None = None
        self._transcribe_done = threading.Event()
        self._transcribe_done.set()  # not transcribing initially
        self.status_item = rumps.MenuItem("Status: Loading model...")
        self.status_item.set_callback(None)
        self.toggle_item = rumps.MenuItem("Start Recording", callback=self._on_toggle_click)
-        self.context_item = rumps.MenuItem("Set Whisper Context...", callback=self._on_set_context)
+        ctx = cfg.get("context", "")
        context_label = f"Set Whisper Context... ({ctx[:20]}...)" if ctx else "Set Whisper Context..."
        self.context_item = rumps.MenuItem(context_label, callback=self._on_set_context)
        # Language submenu
        self._lang_menu = rumps.MenuItem("Language")
@@ -67,6 +74,19 @@ class CalliopeApp(rumps.App):
            item = rumps.MenuItem(f"{prefix}{short}", callback=self._on_model_select)
            self._model_menu.add(item)
        # Microphone submenu
        self._mic_menu = rumps.MenuItem("Microphone")
        self._build_mic_menu()
        # Typing mode submenu
        self._typing_menu = rumps.MenuItem("Typing Mode")
        current_mode = cfg.get("typing_mode", "char")
        for mode, label in [("char", "Character (CGEvents)"), ("clipboard", "Clipboard (Cmd+V)")]:
            prefix = "\u2713 " if mode == current_mode else "   "
            item = rumps.MenuItem(f"{prefix}{label}", callback=self._on_typing_mode_select)
            item._typing_mode = mode
            self._typing_menu.add(item)
        quit_item = rumps.MenuItem("Quit Calliope", callback=self._on_quit)
        self.menu = [
@@ -76,6 +96,8 @@ class CalliopeApp(rumps.App):
            self.context_item,
            self._lang_menu,
            self._model_menu,
            self._mic_menu,
            self._typing_menu,
            None,
            quit_item,
        ]
@@ -92,19 +114,34 @@ class CalliopeApp(rumps.App):
        # Load model in background
        threading.Thread(target=self._load_model, daemon=True).start()
    def _notify(self, title: str, subtitle: str, message: str) -> None:
        if not self.cfg.get("notifications", True):
            return
        try:
            text = f"{subtitle} — {message}" if subtitle else message
            script = f'display notification "{text}" with title "{title}"'
            subprocess.Popen(
                ["osascript", "-e", script],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        except Exception:
            pass
    def _ready_status(self) -> str:
        short = self.transcriber.model.split("/")[-1]
        return f"Status: Ready ({short})"
    def _load_model(self) -> None:
        try:
            self.transcriber.load()
-            self.status_item.title = "Status: Ready"
+            self.status_item.title = self._ready_status()
            self.hotkeys.start()
            log.info("Model loaded, hotkeys active")
        except Exception:
            log.error("Failed to load model", exc_info=True)
            self.status_item.title = "Status: Model load failed"
-            try:
+            self._notify("Calliope", "Error", "Failed to load Whisper model. Check logs.")
                rumps.notification("Calliope", "Error", "Failed to load Whisper model. Check logs.")
            except RuntimeError:
                pass
    @staticmethod
    def _activate_app():
@@ -125,20 +162,23 @@ class CalliopeApp(rumps.App):
        response = rumps.Window(
            message="Provide context to help Whisper with domain-specific terms, "
            "names, or jargon. For example:\n\n"
-            "\"Meeting about Kubernetes, gRPC, and the Istio service mesh.\"",
+            "\"Meeting about Kubernetes, gRPC, and the Istio service mesh.\"\n\n"
            "Clear the field and press Save to remove context.",
            title="Set Whisper Context",
            default_text=self.transcriber.context,
            ok="Save",
-            cancel="Clear",
+            cancel="Cancel",
            dimensions=(320, 120),
        ).run()
-        if response.clicked == 1:  # Save
+        if response.clicked != 1:  # Cancel / Escape
            self._deactivate_app()
            return
        self.transcriber.context = response.text.strip()
        else:  # Clear
            self.transcriber.context = ""
        self._deactivate_app()
        ctx = self.transcriber.context
        self.context_item.title = f"Set Whisper Context... ({ctx[:20]}...)" if ctx else "Set Whisper Context..."
        self.cfg["context"] = ctx
        config_mod.save(self.cfg)
    def _on_language_select(self, sender) -> None:
        display_name = sender.title.strip().lstrip("\u2713").strip()
@@ -170,19 +210,68 @@ class CalliopeApp(rumps.App):
        config_mod.save(self.cfg)
        self.status_item.title = "Status: Loading model..."
        self.hotkeys.stop()
        def _switch():
            self._transcribe_done.wait()  # wait for in-flight transcription
            self._release_transcriber()
-        self.transcriber = Transcriber(model=model_id)
+            self.transcriber = Transcriber(
                model=model_id,
                silence_threshold=self.cfg.get("silence_threshold", 0.005),
            )
            self.transcriber.context = self.cfg.get("context", "")
            self.transcriber.language = self.cfg.get("language", "auto")
-        threading.Thread(target=self._load_model, daemon=True).start()
+            self._load_model()
        threading.Thread(target=_switch, daemon=True).start()
        log.info("Switching model to %s", model_id)
    def _build_mic_menu(self) -> None:
        """Fill the microphone submenu with "System Default" plus every input device.

        The entry matching the configured ``device`` (``None`` selects the
        system default) is prefixed with a check mark.  Each device entry
        stores its sounddevice index in ``_device_index`` so the selection
        callback can read it back.
        """
        import sounddevice as sd

        selected = self.cfg.get("device")
        checked, unchecked = "\u2713 ", "   "

        # System default
        default_mark = checked if selected is None else unchecked
        default_entry = rumps.MenuItem(f"{default_mark}System Default", callback=self._on_mic_select)
        self._mic_menu.add(default_entry)

        # List input devices
        for info in sd.query_devices():
            if not info["max_input_channels"] > 0:
                continue  # output-only device
            device_idx = info["index"]
            device_name = info["name"]
            mark = checked if selected == device_idx else unchecked
            entry = rumps.MenuItem(f"{mark}{device_name}", callback=self._on_mic_select)
            entry._device_index = device_idx
            self._mic_menu.add(entry)
    def _on_mic_select(self, sender) -> None:
        """Switch the recorder to the microphone chosen in the menu.

        Reads the device index stored on the clicked item (absent on the
        "System Default" entry, which therefore yields ``None``), applies it
        to the recorder, persists it in the config and moves the check mark.

        Args:
            sender: The clicked menu item.
        """
        name = sender.title.strip().lstrip("\u2713").strip()
        device_index = getattr(sender, "_device_index", None)
        self.recorder._device = device_index
        self.cfg["device"] = device_index
        config_mod.save(self.cfg)
        # Update checkmarks.  Match on the stored device index rather than the
        # display name: two devices can share a name, and a name match would
        # tick both of them.
        for item in self._mic_menu.values():
            item_name = item.title.strip().lstrip("\u2713").strip()
            is_selected = getattr(item, "_device_index", None) == device_index
            item.title = f"\u2713 {item_name}" if is_selected else f"   {item_name}"
        log.info("Microphone set to %s (device=%s)", name, device_index)
    def _on_typing_mode_select(self, sender) -> None:
        """Persist the typing mode chosen in the menu and move the check mark to it.

        Args:
            sender: The clicked menu item; its ``_typing_mode`` attribute
                names the mode to activate.
        """
        chosen = sender._typing_mode
        self.cfg["typing_mode"] = chosen
        config_mod.save(self.cfg)
        for entry in self._typing_menu.values():
            # Strip any existing check mark / padding before re-prefixing.
            text = entry.title.strip().lstrip("\u2713").strip()
            if getattr(entry, "_typing_mode", None) == chosen:
                entry.title = f"\u2713 {text}"
            else:
                entry.title = f"   {text}"
        log.info("Typing mode set to %s", chosen)
    def _release_transcriber(self) -> None:
        """Free the current Whisper model to reclaim GPU memory."""
        import gc
        import torch
        if self.transcriber is not None:
            self.transcriber._pipe = None
            self.transcriber._tokenizer = None
-        import torch
+        gc.collect()
        if torch.backends.mps.is_available():
            torch.mps.empty_cache()
@@ -214,18 +303,12 @@ class CalliopeApp(rumps.App):
            self.title = "\U0001f3a4"  # 🎤
            self.toggle_item.title = "Start Recording"
            self.status_item.title = "Status: Mic error (check device)"
-            try:
+            self._notify("Calliope", "", "Microphone unavailable — check audio device")
                rumps.notification("Calliope", "", "Microphone unavailable — check audio device")
            except RuntimeError:
                pass
            return
        self.overlay.show()
        self._rec_timer = rumps.Timer(self._update_rec_duration, 1)
        self._rec_timer.start()
-        try:
+        self._notify("Calliope", "", "Recording started")
            rumps.notification("Calliope", "", "Recording started")
        except RuntimeError:
            pass  # Info.plist missing CFBundleIdentifier
        log.info("Recording started")
    def _stop_and_transcribe(self) -> None:
@@ -238,36 +321,45 @@ class CalliopeApp(rumps.App):
            self._rec_timer = None
        duration = int(time.time() - self._rec_start_time) if self._rec_start_time else 0
        self._rec_start_time = None
-        self.title = "\U0001f3a4"  # 🎤
+        self.title = "\u23f3"  # ⏳
        self.toggle_item.title = "Start Recording"
        self.status_item.title = "Status: Transcribing..."
        self.overlay.show_transcribing()
        audio = self.recorder.stop()
-        try:
+        self._notify("Calliope", "", f"Recording stopped ({duration}s)")
            rumps.notification("Calliope", "", f"Recording stopped ({duration}s)")
        except RuntimeError:
            pass
        log.info("Recording stopped, %d samples", audio.size)
        self._transcribe_done.clear()
        threading.Thread(target=self._transcribe_and_type, args=(audio,), daemon=True).start()
    def _update_rec_duration(self, timer) -> None:
        if self._rec_start_time is None:
            return
        elapsed = int(time.time() - self._rec_start_time)
        max_dur = self.cfg.get("max_recording_seconds", 300)
        if max_dur and elapsed >= max_dur:
            log.info("Max recording duration reached (%ds)", max_dur)
            self._stop_and_transcribe()
            return
        minutes, seconds = divmod(elapsed, 60)
        self.title = f"\U0001f534 {minutes}:{seconds:02d}"
    def _transcribe_and_type(self, audio) -> None:
        try:
            text = self.transcriber.transcribe(audio)
            if not text:
                self.overlay.hide()
                self.title = "\U0001f3a4"  # 🎤
                self.status_item.title = self._ready_status()
                self._notify("Calliope", "", "No speech detected — audio too short or too quiet")
                return
            if text:
                def _do_type():
                    try:
                        if self.cfg.get("typing_mode", "char") == "clipboard":
                            type_text_clipboard(text)
                        else:
-                            type_text(text)
+                            type_text(text, delay=self.cfg.get("typing_delay", 0.005))
                        print(f"\n[Calliope] {text}")
                        log.info("Typed %d chars", len(text))
                    except Exception:
@@ -275,15 +367,15 @@ class CalliopeApp(rumps.App):
                from PyObjCTools.AppHelper import callAfter
                callAfter(_do_type)
            self.overlay.hide()
-            self.status_item.title = "Status: Ready"
+            self.status_item.title = self._ready_status()
        except Exception:
            log.error("Transcription failed", exc_info=True)
            self.overlay.hide()
-            self.status_item.title = "Status: Ready"
+            self.status_item.title = self._ready_status()
-            try:
+            self._notify("Calliope", "Error", "Transcription failed. Check logs.")
-                rumps.notification("Calliope", "Error", "Transcription failed. Check logs.")
+        finally:
-            except RuntimeError:
+            self.title = "\U0001f3a4"  # 🎤
-                pass
+            self._transcribe_done.set()
    def _on_quit(self, sender) -> None:
        self.hotkeys.stop()
--- a/calliope/config.py
+++ b/calliope/config.py
@@ -22,6 +22,10 @@ DEFAULTS: dict[str, Any] = {
    "context": "",
    "debug": False,
    "typing_mode": "char",  # "char" or "clipboard"
    "max_recording_seconds": 300,  # 5 minutes
    "silence_threshold": 0.005,  # RMS energy below which audio is considered silence
    "notifications": True,  # show macOS notifications
    "typing_delay": 0.005,  # seconds between keystrokes in char mode
 }
 LANGUAGES: dict[str, str] = {
--- a/calliope/hotkeys.py
+++ b/calliope/hotkeys.py
@@ -14,21 +14,53 @@ _KEY_MAP: dict[str, keyboard.Key] = {
    "alt": keyboard.Key.alt,
    "cmd": keyboard.Key.cmd,
    "space": keyboard.Key.space,
    "tab": keyboard.Key.tab,
    "esc": keyboard.Key.esc,
    "enter": keyboard.Key.enter,
    "backspace": keyboard.Key.backspace,
    "delete": keyboard.Key.delete,
 }
 # Add function keys F1-F12
 for _i in range(1, 13):
    _KEY_MAP[f"f{_i}"] = getattr(keyboard.Key, f"f{_i}")
 # Virtual keycodes for left/right modifier normalization (macOS).
 # Maps the keycode of each left/right modifier variant to one generic pynput
 # key, so either side of the keyboard resolves to the same key.
 _VK_NORMALIZE = {
    0x3B: keyboard.Key.ctrl,   # left ctrl
    0x3E: keyboard.Key.ctrl,   # right ctrl
    0x38: keyboard.Key.shift,  # left shift
    0x3C: keyboard.Key.shift,  # right shift
    0x3A: keyboard.Key.alt,    # left alt/option
    0x3D: keyboard.Key.alt,    # right alt/option
    0x37: keyboard.Key.cmd,    # left cmd
    0x36: keyboard.Key.cmd,    # right cmd
 }
-def _parse_combo(combo: str) -> set[keyboard.Key]:
+def _parse_combo(combo: str) -> set:
-    """Parse 'ctrl+shift' into a set of pynput keys."""
+    """Parse 'ctrl+shift' or 'ctrl+r' into a set of pynput keys."""
-    keys: set[keyboard.Key] = set()
+    keys: set = set()
    for part in combo.lower().split("+"):
        part = part.strip()
        if part in _KEY_MAP:
            keys.add(_KEY_MAP[part])
        elif len(part) == 1:
            keys.add(keyboard.KeyCode.from_char(part))
        else:
            log.warning("Unknown key in combo: %s", part)
    return keys
 def _check_accessibility() -> bool:
    """Check if Accessibility permission is currently granted."""
    try:
        from ApplicationServices import AXIsProcessTrusted
        return AXIsProcessTrusted()
    except Exception:
        return True  # assume granted if we can't check
 class HotkeyListener:
    def __init__(
        self,
@@ -51,6 +83,8 @@ class HotkeyListener:
        log.debug("PTT keys: %s, Toggle keys: %s", self._ptt_keys, self._toggle_keys)
    def start(self) -> None:
        if not _check_accessibility():
            log.error("Accessibility permission not granted — hotkeys will not work")
        self._pressed.clear()
        self._ptt_active = False
        self._toggle_active = False
@@ -75,21 +109,27 @@ class HotkeyListener:
    def _normalize(self, key) -> keyboard.Key | keyboard.KeyCode:
        if hasattr(key, "value") and hasattr(key.value, "vk"):
            vk = key.value.vk
-            if vk in (0x3B, 0x3E):
+            normalized = _VK_NORMALIZE.get(vk)
-                return keyboard.Key.ctrl
+            if normalized is not None:
-            if vk in (0x38, 0x3C):
+                return normalized
-                return keyboard.Key.shift
+        # Normalize character keys to lowercase
        if isinstance(key, keyboard.KeyCode) and key.char is not None:
            return keyboard.KeyCode.from_char(key.char.lower())
        return key
    def _on_press(self, key) -> None:
        key = self._normalize(key)
        self._pressed.add(key)
        # Check PTT first; if PTT fires, skip toggle to prevent double-trigger
        if self._ptt_keys.issubset(self._pressed) and not self._ptt_active:
            self._ptt_active = True
            self._on_ptt_start()
            return
        if self._toggle_keys.issubset(self._pressed) and not self._toggle_active:
            # Don't fire toggle if PTT is active
            if not self._ptt_active:
                self._toggle_active = True
                self._on_toggle()
--- a/calliope/overlay.py
+++ b/calliope/overlay.py
@@ -109,18 +109,28 @@ class WaveformView(NSView):
        if not amps:
            return
-        step = draw_w / max(len(amps) - 1, 1)
+        # Draw centered: newest sample at center, older samples outward, mirrored
        half_bars = len(amps)
        mid_x = w / 2
        step = (draw_w / 2) / max(half_bars - 1, 1)
        for sign in (1, -1):
            line = NSBezierPath.bezierPath()
            line.setLineWidth_(1.5)
            # Left half: oldest at left edge, newest at center
            for i, a in enumerate(amps):
-                x = padding + i * step
+                x = mid_x - (half_bars - 1 - i) * step
                y_off = a * draw_h * sign
                if i == 0:
                    line.moveToPoint_((x, mid_y + y_off))
                else:
                    line.lineToPoint_((x, mid_y + y_off))
            # Right half: mirror (newest at center, oldest at right edge)
            for i in range(1, half_bars):
                a = amps[half_bars - 1 - i]
                x = mid_x + i * step
                y_off = a * draw_h * sign
                line.lineToPoint_((x, mid_y + y_off))
            line.stroke()
        self._draw_label("calliope recording...")
@@ -252,8 +262,17 @@ class WaveformOverlay:
        """Switch overlay to transcribing state (pulsing dots)."""
        callAfter(self._show_transcribing_on_main)
    def _reposition_panel(self):
        """Move the panel to the top-center of the current main screen.

        The panel is centered horizontally and placed 40 points below the
        top edge of the screen.  Does nothing when no main screen is
        available.
        """
        screen = NSScreen.mainScreen()
        if screen is None:
            # No screen to position against; leave the panel where it is.
            return
        screen_frame = screen.frame()
        # The frame is expressed in global screen coordinates, so its origin
        # must be included; otherwise a main screen that is not at (0, 0)
        # gets the panel placed on a different display.
        x = screen_frame.origin.x + (screen_frame.size.width - WIDTH) / 2
        y = screen_frame.origin.y + screen_frame.size.height - HEIGHT - 40
        self._panel.setFrameOrigin_(NSMakePoint(x, y))
    def _show_on_main(self):
        self._ensure_panel()
        self._reposition_panel()
        self._view.stopFade()
        self._view.mode = OverlayMode.RECORDING
        self._view.amplitudes = deque([0.0] * NUM_BARS, maxlen=NUM_BARS)
@@ -265,6 +284,7 @@ class WaveformOverlay:
    def _show_transcribing_on_main(self):
        self._ensure_panel()
        self._reposition_panel()
        self._view.stopFade()
        self._view.mode = OverlayMode.TRANSCRIBING
        self._view._pulse_start = time.monotonic()
--- a/calliope/transcriber.py
+++ b/calliope/transcriber.py
@@ -10,12 +10,23 @@ log = logging.getLogger(__name__)
 class Transcriber:
-    def __init__(self, model: str = "distil-whisper/distil-large-v3"):
+    def __init__(self, model: str = "distil-whisper/distil-large-v3", silence_threshold: float = 0.005):
        self.model = model
        self._pipe = None
        self._tokenizer = None
-        self.context: str = ""
+        self._context: str = ""
        self._cached_prompt_ids = None
        self.language: str = "auto"
        self.silence_threshold = silence_threshold
    @property
    def context(self) -> str:
        """Return the context text currently stored on this transcriber."""
        return self._context
    @context.setter
    def context(self, value: str) -> None:
        """Store a new context text and drop the cached prompt ids."""
        self._context = value
        self._cached_prompt_ids = None  # invalidate cache
    def load(self) -> None:
        from transformers import AutoTokenizer
@@ -32,7 +43,12 @@ class Transcriber:
                device=device,
            )
            self._tokenizer = AutoTokenizer.from_pretrained(self.model)
-            log.info("Model loaded successfully")
+            log.info("Model loaded, running warmup...")
            self._pipe(
                {"raw": np.zeros(16_000, dtype=np.float32), "sampling_rate": 16_000},
                batch_size=1,
            )
            log.info("Model ready")
        except Exception:
            log.error("Failed to load model %s", self.model, exc_info=True)
            raise
@@ -48,18 +64,18 @@ class Transcriber:
        duration = audio.size / 16_000
        energy = float(np.sqrt(np.mean(audio ** 2)))
        log.debug("Audio: %.1fs, RMS energy: %.6f", duration, energy)
-        if duration < 1.0 or energy < 0.005:
+        if duration < 1.0 or energy < self.silence_threshold:
            log.debug("Audio too short or too quiet, skipping transcription")
            return ""
        generate_kwargs = {}
-        if self.context:
+        if self._context:
-            prompt_ids = self._tokenizer.get_prompt_ids(self.context)
+            if self._cached_prompt_ids is None:
-            generate_kwargs["prompt_ids"] = prompt_ids
+                self._cached_prompt_ids = self._tokenizer.get_prompt_ids(self._context)
            generate_kwargs["prompt_ids"] = self._cached_prompt_ids
        pipe_kwargs = {
-            "batch_size": 4,
+            "batch_size": 1,
            "return_timestamps": True,
            "generate_kwargs": generate_kwargs,
        }
        if self.language != "auto":
--- a/calliope/typer.py
+++ b/calliope/typer.py
@@ -1,7 +1,6 @@
 """Type text into the focused field using Quartz CGEvents."""
 import logging
 import subprocess
 import time
 import Quartz
@@ -9,36 +8,51 @@ import Quartz
 log = logging.getLogger(__name__)
-def type_text(text: str) -> None:
+def type_text(text: str, delay: float = 0.005) -> None:
    """Simulate typing text into the currently focused text field."""
    for char in text:
        _type_char(char)
-        time.sleep(0.005)
+        time.sleep(delay)
 def type_text_clipboard(text: str) -> None:
    """Type text by copying to clipboard and pasting with Cmd+V.
-    Saves and restores the previous clipboard contents.
+    Saves and restores the previous clipboard contents, including non-text
    data like images and files.
    """
-    # Save current clipboard
+    from AppKit import NSPasteboard, NSStringPboardType
    try:
        prev = subprocess.run(
            ["pbpaste"], capture_output=True, text=True, timeout=2,
        ).stdout
    except Exception:
        prev = None
-    # Copy text to clipboard
+    pb = NSPasteboard.generalPasteboard()
    subprocess.run(["pbcopy"], input=text, text=True, timeout=2)
-    # Paste with Cmd+V
+    # Save all current pasteboard items
    saved_items = []
    for item in pb.pasteboardItems() or []:
        item_data = {}
        for ptype in item.types():
            data = item.dataForType_(ptype)
            if data is not None:
                item_data[ptype] = data
        if item_data:
            saved_items.append(item_data)
    # Set our text and paste
    pb.clearContents()
    pb.setString_forType_(text, NSStringPboardType)
    _cmd_v()
    time.sleep(0.05)
-    # Restore previous clipboard
+    # Restore previous clipboard contents
-    if prev is not None:
+    if saved_items:
-        subprocess.run(["pbcopy"], input=prev, text=True, timeout=2)
+        from AppKit import NSPasteboardItem
        pb.clearContents()
        new_items = []
        for item_data in saved_items:
            item = NSPasteboardItem.alloc().init()
            for ptype, data in item_data.items():
                item.setData_forType_(data, ptype)
            new_items.append(item)
        pb.writeObjects_(new_items)
 def _cmd_v() -> None: