diff --git a/environments/environment-Linux.yml b/environments/environment-Linux.yml index db722fe28..b0a077bc7 100644 --- a/environments/environment-Linux.yml +++ b/environments/environment-Linux.yml @@ -23,3 +23,4 @@ dependencies: - nwbinspector == 0.6.5 - tables - numcodecs == 0.15.1 # numcodecs 0.16.0 is not compatible with zarr 2.18.5 + - claude-agent-sdk >= 0.1.0 # AI conversion assistant diff --git a/environments/environment-MAC-apple-silicon.yml b/environments/environment-MAC-apple-silicon.yml index 2147aebdb..5dbe560ee 100644 --- a/environments/environment-MAC-apple-silicon.yml +++ b/environments/environment-MAC-apple-silicon.yml @@ -29,3 +29,4 @@ dependencies: - ndx-pose == 0.2.2 - nwbinspector == 0.6.5 - numcodecs == 0.15.1 # numcodecs 0.16.0 is not compatible with zarr 2.18.5 + - claude-agent-sdk >= 0.1.0 # AI conversion assistant diff --git a/environments/environment-MAC-intel.yml b/environments/environment-MAC-intel.yml index 19f301be4..620d95324 100644 --- a/environments/environment-MAC-intel.yml +++ b/environments/environment-MAC-intel.yml @@ -28,3 +28,4 @@ dependencies: # with tables==3.9.1 (latest that can be used by neuroconv 0.6.0). 
# h5py and tables need to be consistent for electron build for unknown reason - ruamel.yaml.clib != 0.2.13 # 0.2.13 throws a build error on intel Mac -- see https://github.com/catalystneuro/roiextractors/issues/489 + - claude-agent-sdk >= 0.1.0 # AI conversion assistant diff --git a/environments/environment-Windows.yml b/environments/environment-Windows.yml index 3662a2534..d1da96d49 100644 --- a/environments/environment-Windows.yml +++ b/environments/environment-Windows.yml @@ -25,3 +25,4 @@ dependencies: - nwbinspector == 0.6.5 - tables - numcodecs == 0.15.1 # numcodecs 0.16.0 is not compatible with zarr 2.18.5 + - claude-agent-sdk >= 0.1.0 # AI conversion assistant diff --git a/nwb-guide.spec b/nwb-guide.spec index cd596347d..514ee0fd2 100644 --- a/nwb-guide.spec +++ b/nwb-guide.spec @@ -10,7 +10,11 @@ import scipy from PyInstaller.utils.hooks import collect_data_files from PyInstaller.utils.hooks import collect_all -datas = [('./src/paths.config.json', '.'), ('./package.json', '.')] +datas = [ + ('./src/paths.config.json', '.'), + ('./package.json', '.'), + ('./src/pyflask/ai/skill', 'ai/skill'), # Bundled NWB conversion skill +] binaries = [] hiddenimports = [ 'email_validator', @@ -24,6 +28,7 @@ datas += collect_data_files('jsonschema_specifications') # Various consequences of lazy imports modules_to_collect = [ + 'claude_agent_sdk', 'dandi', 'keyrings', 'unittest', diff --git a/package-lock.json b/package-lock.json index 6fb13db91..ea13ffd5f 100644 --- a/package-lock.json +++ b/package-lock.json @@ -24,6 +24,7 @@ "jsonschema": "^1.4.1", "lit": "^2.6.1", "lottie-web": "^5.9.5", + "marked": "^17.0.1", "notyf": "^3.9.0", "sweetalert2": "^11.6.13", "tippy.js": "^6.3.7", @@ -16411,6 +16412,18 @@ "react": ">= 0.14.0" } }, + "node_modules/marked": { + "version": "17.0.1", + "resolved": "https://registry.npmjs.org/marked/-/marked-17.0.1.tgz", + "integrity": "sha512-boeBdiS0ghpWcSwoNm/jJBwdpFaMnZWRzjA6SkUMYb40SVaN1x7mmfGKp0jvexGcx+7y2La5zRZsYFZI6Qpypg==", + 
"license": "MIT", + "bin": { + "marked": "bin/marked.js" + }, + "engines": { + "node": ">= 20" + } + }, "node_modules/matchdep": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/matchdep/-/matchdep-2.0.0.tgz", diff --git a/package.json b/package.json index f4d7e863c..56a642ef5 100644 --- a/package.json +++ b/package.json @@ -158,6 +158,7 @@ "jsonschema": "^1.4.1", "lit": "^2.6.1", "lottie-web": "^5.9.5", + "marked": "^17.0.1", "notyf": "^3.9.0", "sweetalert2": "^11.6.13", "tippy.js": "^6.3.7", diff --git a/src/electron/frontend/core/components/pages/ai-assistant/AIAssistantPage.js b/src/electron/frontend/core/components/pages/ai-assistant/AIAssistantPage.js new file mode 100644 index 000000000..7a926e7d4 --- /dev/null +++ b/src/electron/frontend/core/components/pages/ai-assistant/AIAssistantPage.js @@ -0,0 +1,1437 @@ +import { html, css } from "lit"; +import { Page } from "../Page.js"; +import { baseUrl } from "../../../server/globals"; + +import "./ChatMessage.js"; +import "./ChatInput.js"; +import "./SettingsPanel.js"; + +/** + * AI Assistant page — chat interface for the NWB conversion agent. + * + * Two views: + * 1. Session list (home) — shows previous chats + "New Conversation" button + * 2. 
Chat view — active conversation with message list + input + * + * Communicates with the Flask /ai namespace via: + * - GET /ai/sessions (list saved sessions) + * - POST /ai/sessions (create session) + * - GET /ai/sessions/ (get session state or history) + * - POST /ai/sessions//message (send message) + * - GET /ai/sessions//events (SSE stream) + * - DELETE /ai/sessions/ (delete session) + */ +export class AIAssistantPage extends Page { + static properties = { + ...super.properties, + messages: { type: Array, state: true }, + sessionId: { type: String, state: true }, + dataDirs: { type: Array, state: true }, + isStreaming: { type: Boolean, state: true }, + settingsOpen: { type: Boolean, state: true }, + connected: { type: Boolean, state: true }, + savedSessions: { type: Array, state: true }, + viewMode: { type: String, state: true }, // "list" or "chat" + isReadOnly: { type: Boolean, state: true }, + authMode: { type: String, state: true }, + currentPhase: { type: Number, state: true }, + todos: { type: Array, state: true }, + }; + + header = { + title: "AI Assistant", + subtitle: "Convert your data to NWB format with AI guidance.", + }; + + constructor(...args) { + super(...args); + this.messages = []; + this.sessionId = null; + this.dataDirs = []; + this._dirInput = ""; + this.isStreaming = false; + this.settingsOpen = false; + this.connected = false; + this.savedSessions = []; + this.viewMode = "list"; + this.isReadOnly = false; + this.authMode = null; + this.currentPhase = 0; + this.todos = []; + this._eventSource = null; + this._starting = false; + this._todoIdMap = new Map(); // TodoWrite id -> text + + this.style.height = "100%"; + } + + createRenderRoot() { + return this; + } + + connectedCallback() { + super.connectedCallback(); + this._loadSessions(); + } + + disconnectedCallback() { + super.disconnectedCallback(); + this._closeEventSource(); + } + + async _loadSessions() { + try { + const resp = await fetch(new URL("/ai/sessions", baseUrl)); + if 
(resp.ok) { + const data = await resp.json(); + this.savedSessions = data.sessions || []; + } + } catch { + // ignore — sessions list is optional + } + } + + render() { + if (this.viewMode === "list") { + return this._renderSessionList(); + } + return this._renderChatView(); + } + + // ── Session List View ────────────────────────────────────────────── + + _renderSessionList() { + return html` + + +
+ + +
+

Conversations

+
+ + +
+
+ +
+ ${this.savedSessions.length === 0 + ? html` +
+

NWB Conversion Assistant

+

+ I'll help you convert your neurophysiology data to NWB format and publish it on + DANDI Archive. +

+

Click + New Conversation to get started.

+
+ ` + : this.savedSessions.map( + (s) => html` +
this._viewSession(s.session_id)}> +
${s.message_count > 0 ? "..." : ""}
+
+
${s.title}
+
+ ${this._formatDate(s.updated_at)} · ${s.message_count} messages + · + ${(s.data_dirs || [s.data_dir]).map((d) => this._shortDir(d)).join(", ")} +
+
+
+ +
+
+ ` + )} +
+
+ `; + } + + // ── Chat View ────────────────────────────────────────────────────── + + _renderChatView() { + const PHASES = [ + "Experiment Discovery", + "Data Inspection", + "Metadata Collection", + "Synchronization", + "Code Generation", + "Testing & Validation", + "DANDI Upload", + ]; + + return html` + + +
+ + + + +
+ + + ${this.isReadOnly + ? "" + : !this.connected + ? html` + + { + this._dirInput = e.target.value; + this.requestUpdate(); + }} + @keydown=${(e) => { + if (e.key === "Enter") { + e.preventDefault(); + this._addFolder(); + } + }} + placeholder="/path/to/your/data" + /> + + + + ` + : html`Connected ${this.authMode + ? html`${this.authMode === "proxy" + ? "Free Credits" + : this.authMode === "subscription" + ? "Your Anthropic Account" + : "Your API Key"}` + : ""}`} + ${this.connected ? html`` : ""} + +
+ + ${this.isReadOnly + ? html` +
+ Viewing saved conversation (read-only) +
+ ` + : ""} + ${this.dataDirs.length > 0 && !this.isReadOnly + ? html` +
+ ${this.dataDirs.map( + (dir, i) => html` + + ${this._shortDir(dir)} + ${!this.connected + ? html`` + : ""} + + ` + )} +
+ ` + : ""} + ${!this.connected && !this.isReadOnly + ? html` + + ` + : ""} + + +
+ +
+
+ ${this.messages.length === 0 && !this.connected && !this.isReadOnly + ? html` +
+

NWB Conversion Assistant

+

Select your data folder above and click Start to begin.

+
+ ` + : ""} + ${this.messages.map( + (msg) => + html`` + )} +
+ + ${!this.isReadOnly + ? html` +
+
+ ${this.isStreaming + ? html`
` + : ""} + + ${this.isStreaming + ? html`` + : ""} +
+
+ ` + : ""} +
+ + +
+

Progress

+
    + ${PHASES.map((name, i) => { + const num = i + 1; + const status = + num < this.currentPhase ? "completed" : num === this.currentPhase ? "active" : ""; + const phaseTodos = this.todos.filter((t) => t.phase === num); + return html` +
  • + ${status === "completed" ? "\u2713" : num} + ${name} +
  • + ${phaseTodos.length > 0 + ? html` +
    + ${phaseTodos.map( + (t) => html` +
    + ${t.done ? "\u2611" : "\u2610"} + ${t.text} +
    + ` + )} +
    + ` + : ""} + `; + })} +
+ + ${this.todos.filter((t) => !t.phase).length > 0 + ? html` +
+

Other Items

+ ${this.todos + .filter((t) => !t.phase) + .map( + (t) => html` +
+ ${t.done ? "\u2611" : "\u2610"} + ${t.text} +
+ ` + )} +
+ ` + : ""} +
+
+
+ `; + } + + _sharedStyles() { + return css``; + } + + // ── Actions ──────────────────────────────────────────────────────── + + _showNewChat() { + this.messages = []; + this.sessionId = null; + this.dataDirs = []; + this._dirInput = ""; + this.connected = false; + this.isStreaming = false; + this.isReadOnly = false; + this.authMode = null; + this.currentPhase = 0; + this.todos = []; + this._starting = false; + this._todoIdMap = new Map(); + this.viewMode = "chat"; + } + + async _viewSession(sessionId) { + try { + const resp = await fetch(new URL(`/ai/sessions/${sessionId}`, baseUrl)); + if (!resp.ok) return; + + const data = await resp.json(); + const dirs = data.data_dirs || (data.data_dir ? [data.data_dir] : []); + if (data.connected) { + // This is an active session — reconnect to it + this.sessionId = sessionId; + this.dataDirs = dirs; + this.connected = true; + this.isReadOnly = false; + this.authMode = data.auth_mode || null; + this.messages = []; + this.currentPhase = 0; + this.todos = []; + this.viewMode = "chat"; + this._connectSSE(); + } else if (data.messages) { + // Saved session — show read-only + this.sessionId = sessionId; + this.dataDirs = dirs; + this.connected = false; + this.isReadOnly = true; + this.messages = data.messages; + this.viewMode = "chat"; + // Rebuild phase + todo state from saved messages + this._rebuildTodoState(data.messages); + } + } catch { + // ignore + } + } + + async _deleteSession(e, sessionId) { + e.stopPropagation(); // Don't trigger card click + try { + await fetch(new URL(`/ai/sessions/${sessionId}?delete_history=true`, baseUrl), { + method: "DELETE", + }); + this.savedSessions = this.savedSessions.filter((s) => s.session_id !== sessionId); + } catch { + // ignore + } + } + + _backToList() { + // If we have an active connection, don't kill it — just go back + if (this.connected) { + // Keep the session alive in the background + } + this._closeEventSource(); + this.viewMode = "list"; + this.isReadOnly = false; + 
this._loadSessions(); // refresh the list + } + + async _browseFolder() { + try { + const { electron } = await import("../../../../utils/electron"); + if (electron?.ipcRenderer) { + const result = await electron.ipcRenderer.invoke("showOpenDialog", { + properties: ["openDirectory"], + title: "Select Data Folder", + }); + if (result && !result.canceled && result.filePaths?.length) { + const dir = result.filePaths[0]; + if (!this.dataDirs.includes(dir)) { + this.dataDirs = [...this.dataDirs, dir]; + } + } + } + } catch { + // Fallback: user types the path manually + } + } + + _addFolder() { + const dir = this._dirInput.trim(); + if (!dir) return; + if (!this.dataDirs.includes(dir)) { + this.dataDirs = [...this.dataDirs, dir]; + } + this._dirInput = ""; + this.requestUpdate(); + } + + _removeFolder(index) { + this.dataDirs = this.dataDirs.filter((_, i) => i !== index); + } + + async _startSession() { + if (this.dataDirs.length === 0 || this.connected || this._starting) return; + this._starting = true; + this.requestUpdate(); + + const settingsPanel = this.querySelector("nwbguide-ai-settings"); + const settings = settingsPanel?.getSettings() || {}; + + try { + const resp = await fetch(new URL("/ai/sessions", baseUrl), { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + data_dirs: this.dataDirs, + api_key: settings.apiKey, + model: settings.model, + }), + }); + + if (!resp.ok) { + const err = await resp.json(); + this._addMessage("error", err.message || "Failed to create session"); + this._starting = false; + return; + } + + const data = await resp.json(); + this.sessionId = data.session_id; + this.authMode = data.auth_mode || null; + + this._connectSSE(); + + await this._waitForConnection(); + this.connected = true; + this._starting = false; + this.currentPhase = 1; // Phase 1 starts immediately + + this._addMessage("assistant", [ + { + type: "text", + text: "Connected! I'm ready to help you convert your data to NWB. 
Let me start by inspecting your data...", + }, + ]); + + const dirList = this.dataDirs.map((d) => ` - ${d}`).join("\n"); + this._sendToAgent( + `I'd like to convert my neurophysiology data to NWB format. My data is located at:\n${dirList}` + ); + } catch (e) { + this._starting = false; + this._addMessage("error", `Connection failed: ${e.message}`); + } + } + + async _waitForConnection(maxWaitMs = 30000) { + const interval = 500; + let elapsed = 0; + while (elapsed < maxWaitMs) { + try { + const resp = await fetch(new URL(`/ai/sessions/${this.sessionId}`, baseUrl)); + if (resp.ok) { + const data = await resp.json(); + if (data.connected) return; + } + } catch { + // ignore fetch errors during polling + } + await new Promise((r) => setTimeout(r, interval)); + elapsed += interval; + } + throw new Error("Agent did not connect in time."); + } + + _connectSSE() { + if (this._eventSource) this._closeEventSource(); + + const url = new URL(`/ai/sessions/${this.sessionId}/events`, baseUrl); + this._eventSource = new EventSource(url); + + this._eventSource.onmessage = (event) => { + try { + const data = JSON.parse(event.data); + this._handleSSEEvent(data); + } catch { + // Ignore parse errors from keepalives + } + }; + + this._eventSource.onerror = () => { + // EventSource will auto-reconnect + }; + } + + _handleSSEEvent(data) { + if (data.type === "assistant") { + this._mergeAssistantContent(data.content); + this._detectPhaseTransition(data.content); + } else if (data.type === "error") { + const content = data.content || ""; + if (content.includes("429") || content.toLowerCase().includes("budget exceeded")) { + this._addMessage( + "error", + "Free credits for this session have been used. Enter an API key in Settings to continue." 
+ ); + } else { + this._addMessage("error", content); + } + this.isStreaming = false; + } else if (data.type === "result") { + this.isStreaming = false; + if (data.is_error) { + this._addMessage("error", data.result || "Agent encountered an error."); + } + } else if (data.type === "done") { + this.isStreaming = false; + } + + this._scrollToBottom(); + } + + // Phase keyword patterns for inferring which phase a task belongs to + static PHASE_KEYWORDS = [ + /* 1 */ /\b(experiment|intake|discover|species|modality|modalities|publication|lab\b|what.*record)/i, + /* 2 */ /\b(inspect|scan|file.?format|interface|neuroconv|data.?inspection|directory|file.?type)/i, + /* 3 */ /\b(metadata|subject|session.?info|electrode|age|sex|genotype|experimenter)/i, + /* 4 */ /\b(sync|clock|timestamp|alignment|synchroniz)/i, + /* 5 */ /\b(code.?gen|convert|script|pip.?install|converter|write.*code|generate.*code)/i, + /* 6 */ /\b(test|valid|inspector|nwbinspector|stub|verif)/i, + /* 7 */ /\b(dandi|upload|dandiset|publish|archive)/i, + ]; + + _inferPhase(text, metadata) { + // 1. Explicit phase in metadata (e.g., TaskCreate with metadata.phase) + if (metadata?.phase) { + const p = parseInt(metadata.phase, 10); + if (p >= 1 && p <= 7) return p; + } + + // 2. Explicit "Phase N" in the text itself + const explicitMatch = text.match(/\bphase\s+(\d)\b/i); + if (explicitMatch) { + const p = parseInt(explicitMatch[1], 10); + if (p >= 1 && p <= 7) return p; + } + + // 3. Keyword matching against phase themes + const lower = text.toLowerCase(); + for (let i = 0; i < AIAssistantPage.PHASE_KEYWORDS.length; i++) { + if (AIAssistantPage.PHASE_KEYWORDS[i].test(lower)) { + return i + 1; + } + } + + // 4. 
Fall back to current phase + return this.currentPhase; + } + + _detectPhaseTransition(content) { + if (!Array.isArray(content)) return; + + for (const block of content) { + // Detect phase headers from text + if (block.type === "text") { + // Match various phase header patterns: + // "Phase 2: Data Inspection", "### Phase 3 — Metadata", "Moving to Phase 4" + const phaseRegex = /(?:^#+\s*)?(?:Phase|phase)\s+(\d)\s*[:.—\-–\s]+(.+?)(?:\n|$)/gm; + let phaseMatch; + while ((phaseMatch = phaseRegex.exec(block.text)) !== null) { + const phaseNum = parseInt(phaseMatch[1], 10); + if (phaseNum >= 1 && phaseNum <= 7 && phaseNum > this.currentPhase) { + this.currentPhase = phaseNum; + this._addMessage("phase", `Phase ${phaseMatch[1]}: ${phaseMatch[2].trim()}`); + } + } + + // Parse checklist items: - [ ] todo or - [x] done + const todoRegex = /^[-*]\s+\[([ xX])\]\s+(.+)$/gm; + let match; + while ((match = todoRegex.exec(block.text)) !== null) { + const done = match[1].toLowerCase() === "x"; + const text = match[2].trim(); + this._upsertTodo(text, done, this._inferPhase(text, null)); + } + } + + if (block.type !== "tool_use") continue; + + // TodoWrite: input.todos is an array of {id, content, status} + if (block.name === "TodoWrite") { + const todos = block.input?.todos; + if (Array.isArray(todos)) { + for (const item of todos) { + const text = item.content || item.subject || item.task || ""; + if (!text) continue; + const done = item.status === "completed"; + this._upsertTodo(text, done, this._inferPhase(text, item.metadata)); + if (item.id) this._todoIdMap.set(item.id, text); + } + } + } + + // TaskCreate: input.subject is the task title + if (block.name === "TaskCreate") { + const subject = block.input?.subject || block.input?.task || ""; + if (subject) { + const desc = block.input?.description || ""; + const phase = this._inferPhase(`${subject} ${desc}`, block.input?.metadata); + this._upsertTodo(subject, false, phase); + } + } + + // TaskUpdate: match by taskId to mark 
done + if (block.name === "TaskUpdate") { + const status = block.input?.status; + const taskId = block.input?.taskId || block.input?.id; + if (status === "completed" && taskId) { + const mappedText = this._todoIdMap.get(taskId); + if (mappedText) { + this._upsertTodo(mappedText, true, null); + } else { + const idx = parseInt(taskId, 10) - 1; + if (idx >= 0 && idx < this.todos.length) { + const updated = [...this.todos]; + updated[idx] = { ...updated[idx], done: true }; + this.todos = updated; + } + } + } + } + } + } + + _upsertTodo(text, done, phase) { + const existing = this.todos.findIndex((t) => t.text === text); + if (existing >= 0) { + const updated = [...this.todos]; + // Keep existing phase unless a more specific one is provided + const existingPhase = updated[existing].phase; + const newPhase = phase || existingPhase; + updated[existing] = { ...updated[existing], done, phase: newPhase }; + this.todos = updated; + } else { + this.todos = [...this.todos, { text, done, phase }]; + } + } + + async _onSendMessage(e) { + const text = e.detail; + if (this.isStreaming) { + await this._interrupt(); + } + this._addMessage("user", text); + this._sendToAgent(text); + this._scrollToBottom(); + } + + async _onChoiceSelected(e) { + const choice = e.detail; + if (!this.connected) return; + if (this.isStreaming) { + await this._interrupt(); + } + this._addMessage("user", choice); + this._sendToAgent(choice); + this._scrollToBottom(); + } + + async _interrupt() { + if (!this.sessionId) return; + try { + await fetch(new URL(`/ai/sessions/${this.sessionId}/interrupt`, baseUrl), { + method: "POST", + }); + this.isStreaming = false; + } catch { + // ignore + } + } + + async _sendToAgent(content) { + if (!this.sessionId) return; + + this.isStreaming = true; + + try { + await fetch(new URL(`/ai/sessions/${this.sessionId}/message`, baseUrl), { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ content }), + }); + } catch (e) { + 
this._addMessage("error", `Failed to send message: ${e.message}`); + this.isStreaming = false; + } + } + + _mergeAssistantContent(content) { + if (!Array.isArray(content)) { + this._addMessage("assistant", content); + return; + } + + const hasOnlyResults = content.every((b) => b.type === "tool_result"); + + if (hasOnlyResults) { + const updated = [...this.messages]; + for (let i = updated.length - 1; i >= 0; i--) { + const msg = updated[i]; + if (msg.role === "assistant" && Array.isArray(msg.content)) { + const hasToolUse = msg.content.some((b) => b.type === "tool_use"); + if (hasToolUse) { + updated[i] = { ...msg, content: [...msg.content, ...content] }; + this.messages = updated; + return; + } + } + } + } + + this._addMessage("assistant", content); + } + + _addMessage(role, content) { + this.messages = [...this.messages, { role, content }]; + } + + _scrollToBottom() { + requestAnimationFrame(() => { + const container = this.querySelector("#ai-messages"); + if (container) { + container.scrollTop = container.scrollHeight; + } + }); + } + + async _newConversation() { + if (this.sessionId) { + try { + await fetch(new URL(`/ai/sessions/${this.sessionId}`, baseUrl), { + method: "DELETE", + }); + } catch { + // ignore + } + } + this._closeEventSource(); + + this.messages = []; + this.sessionId = null; + this.dataDirs = []; + this._dirInput = ""; + this.connected = false; + this.isStreaming = false; + this.isReadOnly = false; + this.authMode = null; + this.currentPhase = 0; + this.todos = []; + this._starting = false; + this._todoIdMap = new Map(); + this.viewMode = "list"; + this._loadSessions(); + } + + _closeEventSource() { + if (this._eventSource) { + this._eventSource.close(); + this._eventSource = null; + } + } + + _rebuildTodoState(messages) { + let phase = 1; // Phase 1 is active from the start + const todoMap = new Map(); // text -> { done, phase } + const idMap = new Map(); // TodoWrite id -> text + + // Local version of _inferPhase that uses `phase` variable 
instead of this.currentPhase + const inferPhase = (text, metadata) => { + if (metadata?.phase) { + const p = parseInt(metadata.phase, 10); + if (p >= 1 && p <= 7) return p; + } + const explicitMatch = text.match(/\bphase\s+(\d)\b/i); + if (explicitMatch) { + const p = parseInt(explicitMatch[1], 10); + if (p >= 1 && p <= 7) return p; + } + const lower = text.toLowerCase(); + for (let i = 0; i < AIAssistantPage.PHASE_KEYWORDS.length; i++) { + if (AIAssistantPage.PHASE_KEYWORDS[i].test(lower)) return i + 1; + } + return phase; + }; + + for (const msg of messages) { + if (msg.role !== "assistant" || !Array.isArray(msg.content)) continue; + + for (const block of msg.content) { + if (block.type === "text") { + // Phases — broader regex + const phaseRegex = /(?:^#+\s*)?(?:Phase|phase)\s+(\d)\s*[:.—\-–\s]+(.+?)(?:\n|$)/gm; + let phaseMatch; + while ((phaseMatch = phaseRegex.exec(block.text)) !== null) { + const num = parseInt(phaseMatch[1], 10); + if (num >= 1 && num <= 7 && num > phase) phase = num; + } + + // Checklist items + const todoRegex = /^[-*]\s+\[([ xX])\]\s+(.+)$/gm; + let m; + while ((m = todoRegex.exec(block.text)) !== null) { + const done = m[1].toLowerCase() === "x"; + const text = m[2].trim(); + const prev = todoMap.get(text); + todoMap.set(text, { done, phase: prev?.phase || inferPhase(text, null) }); + } + } + + if (block.type !== "tool_use") continue; + + // `TodoWrite`: `input.todos` is an array of `{id, content, status}` + if (block.name === "TodoWrite") { + const todos = block.input?.todos; + if (Array.isArray(todos)) { + for (const item of todos) { + const text = item.content || item.subject || item.task || ""; + if (!text) continue; + const done = item.status === "completed"; + const prev = todoMap.get(text); + todoMap.set(text, { done, phase: prev?.phase || inferPhase(text, item.metadata) }); + if (item.id) idMap.set(item.id, text); + } + } + } + + // `TaskCreate`: `input.subject` is the task title + if (block.name === "TaskCreate") { + const 
subject = block.input?.subject || block.input?.task || ""; + if (subject) { + const desc = block.input?.description || ""; + const prev = todoMap.get(subject); + todoMap.set(subject, { + done: prev?.done || false, + phase: prev?.phase || inferPhase(`${subject} ${desc}`, block.input?.metadata), + }); + } + } + + // `TaskUpdate`: match by `taskId` + if (block.name === "TaskUpdate") { + const status = block.input?.status; + const taskId = block.input?.taskId || block.input?.id; + if (status === "completed" && taskId) { + const mappedText = idMap.get(taskId); + if (mappedText) { + const prev = todoMap.get(mappedText); + todoMap.set(mappedText, { ...prev, done: true }); + } + } + } + } + } + + this.currentPhase = phase; + this._todoIdMap = idMap; + this.todos = [...todoMap.entries()].map(([text, { done, phase: p }]) => ({ text, done, phase: p })); + } + + // ── Helpers ───────────────────────────────────────────────────────── + + _formatDate(isoStr) { + if (!isoStr) return ""; + try { + const d = new Date(isoStr); + const now = new Date(); + const diffMs = now - d; + const diffMin = Math.floor(diffMs / 60000); + const diffHr = Math.floor(diffMs / 3600000); + const diffDay = Math.floor(diffMs / 86400000); + + if (diffMin < 1) return "just now"; + if (diffMin < 60) return `${diffMin}m ago`; + if (diffHr < 24) return `${diffHr}h ago`; + if (diffDay < 7) return `${diffDay}d ago`; + return d.toLocaleDateString(); + } catch { + return ""; + } + } + + _shortDir(dirPath) { + if (!dirPath) return ""; + const parts = dirPath.split("/").filter(Boolean); + return parts.length > 2 ? 
".../" + parts.slice(-2).join("/") : dirPath; + } +} + +customElements.get("nwbguide-ai-assistant-page") || + customElements.define("nwbguide-ai-assistant-page", AIAssistantPage); diff --git a/src/electron/frontend/core/components/pages/ai-assistant/ChatInput.js b/src/electron/frontend/core/components/pages/ai-assistant/ChatInput.js new file mode 100644 index 000000000..d2e8c1065 --- /dev/null +++ b/src/electron/frontend/core/components/pages/ai-assistant/ChatInput.js @@ -0,0 +1,123 @@ +import { LitElement, html, css } from "lit"; + +/** + * Text input with send button for the chat interface. + * + * Fires a "send-message" custom event with the message text in `detail`. + */ +export class ChatInput extends LitElement { + static properties = { + disabled: { type: Boolean }, + placeholder: { type: String }, + }; + + static styles = css` + :host { + display: block; + } + + .input-row { + display: flex; + gap: 8px; + align-items: flex-end; + } + + textarea { + flex: 1; + resize: none; + border: 1px solid #ccc; + border-radius: 8px; + padding: 10px 12px; + font-family: inherit; + font-size: 0.95em; + line-height: 1.4; + min-height: 40px; + max-height: 120px; + outline: none; + transition: border-color 0.2s; + } + + textarea:focus { + border-color: #1976d2; + } + + textarea:disabled { + background: #f5f5f5; + cursor: not-allowed; + } + + button { + background: #1976d2; + color: white; + border: none; + border-radius: 8px; + padding: 10px 20px; + cursor: pointer; + font-size: 0.95em; + font-weight: 500; + white-space: nowrap; + transition: background 0.2s; + } + + button:hover:not(:disabled) { + background: #1565c0; + } + + button:disabled { + background: #bbb; + cursor: not-allowed; + } + `; + + constructor() { + super(); + this.disabled = false; + this.placeholder = "Type your message..."; + } + + render() { + return html` +
+ + +
+ `; + } + + _onKeyDown(e) { + // Auto-resize textarea + const textarea = e.target; + textarea.style.height = "auto"; + textarea.style.height = Math.min(textarea.scrollHeight, 120) + "px"; + + // Submit on Enter (without Shift) + if (e.key === "Enter" && !e.shiftKey) { + e.preventDefault(); + this._onSend(); + } + } + + _onSend() { + const textarea = this.shadowRoot.querySelector("textarea"); + const text = textarea.value.trim(); + if (!text || this.disabled) return; + + this.dispatchEvent( + new CustomEvent("send-message", { + detail: text, + bubbles: true, + composed: true, + }) + ); + + textarea.value = ""; + textarea.style.height = "auto"; + } +} + +customElements.get("nwbguide-chat-input") || customElements.define("nwbguide-chat-input", ChatInput); diff --git a/src/electron/frontend/core/components/pages/ai-assistant/ChatMessage.js b/src/electron/frontend/core/components/pages/ai-assistant/ChatMessage.js new file mode 100644 index 000000000..950c884a0 --- /dev/null +++ b/src/electron/frontend/core/components/pages/ai-assistant/ChatMessage.js @@ -0,0 +1,883 @@ +import { LitElement, html, css } from "lit"; +import { unsafeHTML } from "lit/directives/unsafe-html.js"; +import { marked } from "marked"; + +/** + * Renders a single chat message (user, assistant, or tool-use). + * + * @property {Object} message - The message object with `role` and `content`. 
+ * role: "user" | "assistant" | "phase" | "error" + * content: string | Array<{type, text?, name?, input?, content?}> + */ +export class ChatMessage extends LitElement { + static properties = { + message: { type: Object }, + }; + + static styles = css` + :host { + display: block; + margin-bottom: 12px; + } + + .message { + padding: 10px 14px; + border-radius: 8px; + max-width: 85%; + line-height: 1.5; + word-wrap: break-word; + } + + .user { + background: #e3f2fd; + margin-left: auto; + text-align: right; + border-bottom-right-radius: 2px; + white-space: pre-wrap; + } + + .assistant { + background: #f5f5f5; + margin-right: auto; + border-bottom-left-radius: 2px; + } + + .error { + background: #ffebee; + color: #c62828; + margin-right: auto; + border-bottom-left-radius: 2px; + } + + .phase-divider { + text-align: center; + color: #666; + font-size: 0.85em; + font-weight: 600; + padding: 8px 0; + border-top: 1px solid #e0e0e0; + border-bottom: 1px solid #e0e0e0; + margin: 8px 0; + } + + .tool-card { + background: #fafafa; + border: 1px solid #e0e0e0; + border-radius: 6px; + padding: 4px 10px; + margin: 2px 0; + font-size: 0.85em; + } + + .tool-card summary { + cursor: pointer; + font-weight: 500; + color: #555; + } + + .tool-card pre { + margin: 2px 0 4px; + padding: 6px; + background: #f0f0f0; + border-radius: 4px; + overflow-x: auto; + font-size: 0.9em; + max-height: 200px; + overflow-y: auto; + } + + .tool-card pre.tool-error { + background: #ffebee; + color: #c62828; + } + + .tool-summary { + color: #888; + font-weight: 400; + } + + .tool-error-badge { + color: #c62828; + font-size: 0.8em; + font-weight: 600; + } + + .tool-name { + font-weight: 600; + color: #555; + } + + .tool-code { + margin: 2px 0 4px; + padding: 6px 8px; + background: #f8f8f8; + color: #1a1a1a; + border: 1px solid #e0e0e0; + border-radius: 4px; + overflow-x: auto; + font-size: 0.9em; + max-height: 200px; + overflow-y: auto; + } + + .tool-code .hl-kw { + color: #8839ef; + } + .tool-code 
.hl-bi { + color: #d20f39; + } + .tool-code .hl-str { + color: #40a02b; + } + .tool-code .hl-num { + color: #fe640b; + } + .tool-code .hl-cmt { + color: #8c8fa1; + font-style: italic; + } + .tool-code .hl-op { + color: #1a1a1a; + } + .tool-code .hl-dec { + color: #e64553; + } + .tool-code .hl-cls { + color: #1e66f5; + } + + .tool-diff { + display: flex; + flex-direction: column; + gap: 2px; + } + + .tool-diff-old { + margin: 2px 0; + padding: 4px 8px; + background: #ffeef0; + color: #b31d28; + border-radius: 4px; + font-size: 0.9em; + max-height: 150px; + overflow: auto; + border-left: 3px solid #d73a49; + } + + .tool-diff-new { + margin: 2px 0; + padding: 4px 8px; + background: #e6ffed; + color: #22863a; + border-radius: 4px; + font-size: 0.9em; + max-height: 150px; + overflow: auto; + border-left: 3px solid #28a745; + } + + .tool-section-label { + font-size: 0.75em; + color: #999; + margin-top: 4px; + text-transform: uppercase; + letter-spacing: 0.5px; + } + + .text-block { + line-height: 1.5; + } + + .text-block p { + margin: 0.4em 0; + } + + .text-block p:first-child { + margin-top: 0; + } + + .text-block p:last-child { + margin-bottom: 0; + } + + .text-block code { + background: #e8e8e8; + padding: 1px 4px; + border-radius: 3px; + font-size: 0.9em; + } + + .text-block pre { + background: #f8f8f8; + border: 1px solid #e0e0e0; + border-radius: 4px; + padding: 6px 8px; + overflow-x: auto; + font-size: 0.9em; + max-height: 200px; + overflow-y: auto; + } + + .text-block pre code { + background: none; + padding: 0; + } + + .text-block ul, + .text-block ol { + margin: 0.4em 0; + padding-left: 1.5em; + } + + .text-block li { + margin: 0.2em 0; + } + + .text-block h1, + .text-block h2, + .text-block h3, + .text-block h4 { + margin: 0.6em 0 0.3em; + line-height: 1.3; + } + + .text-block h1 { + font-size: 1.2em; + } + .text-block h2 { + font-size: 1.1em; + } + .text-block h3 { + font-size: 1em; + } + + .text-block blockquote { + border-left: 3px solid #ccc; + margin: 
0.4em 0; + padding: 0.2em 0.8em; + color: #555; + } + + .text-block table { + border-collapse: collapse; + margin: 0.4em 0; + font-size: 0.9em; + } + + .text-block th, + .text-block td { + border: 1px solid #ddd; + padding: 4px 8px; + } + + .text-block th { + background: #f0f0f0; + font-weight: 600; + } + + .text-block a { + color: #1976d2; + } + + .text-block strong { + font-weight: 600; + } + + .label { + font-size: 0.75em; + color: #888; + margin-bottom: 4px; + font-weight: 500; + } + + .choices { + display: flex; + flex-wrap: wrap; + gap: 8px; + margin: 8px 0 4px; + } + + .choice-btn { + padding: 8px 16px; + border: 1px solid #90caf9; + border-radius: 20px; + background: #e3f2fd; + color: #1565c0; + cursor: pointer; + font-size: 0.88em; + line-height: 1.4; + transition: + background 0.15s, + border-color 0.15s; + text-align: left; + } + + .choice-btn:hover { + background: #bbdefb; + border-color: #42a5f5; + } + + .choice-btn:active { + background: #90caf9; + } + + .choices-answered .choice-btn { + opacity: 0.5; + cursor: default; + pointer-events: none; + } + + .choices-answered .choice-btn.selected { + opacity: 1; + background: #1976d2; + color: white; + border-color: #1976d2; + } + `; + + render() { + const { role, content } = this.message || {}; + + if (role === "phase") { + return html`
${content}
`; + } + + if (role === "error") { + return html` +
Error
+
${content}
+ `; + } + + if (role === "user") { + return html`
${content}
`; + } + + // Assistant message — content is an array of blocks + if (role === "assistant" && Array.isArray(content)) { + // Build a map of tool_use_id -> tool_result for pairing + const resultMap = {}; + for (const block of content) { + if (block.type === "tool_result") { + resultMap[block.tool_use_id] = block; + } + } + return html` +
+ ${content + .filter((block) => block.type !== "tool_result") + .map((block) => this._renderBlock(block, resultMap))} +
+ `; + } + + // Fallback for plain text assistant + return html`
${content}
`; + } + + _renderBlock(block, resultMap = {}) { + if (block.type === "text") { + // Check for blocks + const choicesMatch = block.text.match(/([\s\S]*?)<\/choices>/); + if (choicesMatch) { + const textBefore = block.text.slice(0, choicesMatch.index).trim(); + const textAfter = block.text.slice(choicesMatch.index + choicesMatch[0].length).trim(); + const options = this._parseChoices(choicesMatch[1]); + + return html` + ${textBefore + ? html`
${unsafeHTML(this._renderMarkdown(textBefore))}
` + : ""} +
+ ${options.map( + (opt) => html` + + ` + )} +
+ ${textAfter + ? html`
${unsafeHTML(this._renderMarkdown(textAfter))}
` + : ""} + `; + } + + return html`
${unsafeHTML(this._renderMarkdown(block.text))}
`; + } + + if (block.type === "tool_use") { + const result = resultMap[block.id]; + const resultPreview = result + ? typeof result.content === "string" + ? result.content.slice(0, 2000) + : JSON.stringify(result.content).slice(0, 2000) + : null; + + return html` +
+ + ${this._renderToolSummary(block)} + ${result?.is_error ? html` error` : ""} + + ${this._renderToolInput(block)} + ${resultPreview != null + ? html` + +
${resultPreview}
+ ` + : ""} +
+ `; + } + + return html``; + } + + _renderToolSummary(block) { + const { name, input } = block; + if (!input) return name; + + if (name === "Bash") { + const cmd = input.command || ""; + // Show first line or first 80 chars + const firstLine = cmd.split("\n")[0].slice(0, 80); + return html`$ + ${firstLine}${cmd.length > 80 || cmd.includes("\n") ? "..." : ""}`; + } + if (name === "Read") + return html`Read + ${this._shortPath(input.file_path)}`; + if (name === "Write") + return html`Write + ${this._shortPath(input.file_path)}`; + if (name === "Edit") + return html`Edit + ${this._shortPath(input.file_path)}`; + if (name === "Glob") + return html`Glob ${input.pattern}`; + if (name === "Grep") + return html`Grep ${input.pattern}`; + return name; + } + + _renderToolInput(block) { + const { name, input } = block; + if (!input) return html``; + + if (name === "Bash") { + const code = input.command || ""; + return html`
${unsafeHTML(this._highlightCode(code, "shell"))}
`; + } + + if (name === "Write") { + const content = input.content || ""; + const snippet = content.slice(0, 2000) + (content.length > 2000 ? "\n..." : ""); + const lang = this._detectLang(snippet, input.file_path); + return html` + +
${unsafeHTML(this._highlightCode(snippet, lang))}
+ `; + } + + if (name === "Edit") { + const lang = this._detectLang(input.new_string || "", input.file_path); + return html` + +
+
${unsafeHTML(this._highlightCode(input.old_string || "", lang))}
+
${unsafeHTML(this._highlightCode(input.new_string || "", lang))}
+
+ `; + } + + // Default: show as JSON + return html`
${JSON.stringify(input, null, 2)}
`; + } + + _detectLang(code, filePath = "") { + if (filePath.endsWith(".py") || filePath.endsWith(".pyi")) return "python"; + if (filePath.endsWith(".js") || filePath.endsWith(".ts")) return "js"; + if (filePath.endsWith(".yml") || filePath.endsWith(".yaml")) return "yaml"; + // Detect from content + if (/^python3?\s|^#!.*python|^\s*(import |from |def |class )/.test(code)) return "python"; + if (/^\s*(const |let |var |function |import )/.test(code)) return "js"; + return "shell"; + } + + _highlightCode(code, lang = "shell") { + // Single-pass tokenizer — avoids nested regex issues + const tokens = this._tokenize(code, lang); + return tokens + .map(([type, text]) => { + const esc = text.replace(/&/g, "&").replace(//g, ">"); + if (type === "plain") return esc; + return `${esc}`; + }) + .join(""); + } + + _tokenize(code, lang) { + const PY_KW = new Set([ + "False", + "None", + "True", + "and", + "as", + "assert", + "async", + "await", + "break", + "class", + "continue", + "def", + "del", + "elif", + "else", + "except", + "finally", + "for", + "from", + "global", + "if", + "import", + "in", + "is", + "lambda", + "nonlocal", + "not", + "or", + "pass", + "raise", + "return", + "try", + "while", + "with", + "yield", + ]); + const PY_BI = new Set([ + "print", + "len", + "range", + "type", + "int", + "str", + "float", + "list", + "dict", + "set", + "tuple", + "open", + "super", + "isinstance", + "hasattr", + "getattr", + "setattr", + "enumerate", + "zip", + "map", + "filter", + "sorted", + "reversed", + "any", + "all", + "min", + "max", + "sum", + "abs", + "round", + "input", + "format", + "id", + "hex", + "oct", + "bin", + "chr", + "ord", + "repr", + "hash", + "dir", + "vars", + "globals", + "locals", + "staticmethod", + "classmethod", + "property", + "Path", + "Union", + ]); + const JS_KW = new Set([ + "const", + "let", + "var", + "function", + "return", + "if", + "else", + "for", + "while", + "do", + "switch", + "case", + "break", + "continue", + "new", + "this", + 
"class", + "extends", + "import", + "export", + "from", + "default", + "async", + "await", + "try", + "catch", + "finally", + "throw", + "typeof", + "instanceof", + "of", + "in", + "yield", + ]); + const JS_BI = new Set([ + "console", + "document", + "window", + "Array", + "Object", + "String", + "Number", + "Boolean", + "Map", + "Set", + "Promise", + "JSON", + "Math", + "Date", + "Error", + "RegExp", + "parseInt", + "parseFloat", + "setTimeout", + "setInterval", + "fetch", + "require", + ]); + const SH_KW = new Set([ + "if", + "then", + "else", + "elif", + "fi", + "for", + "do", + "done", + "while", + "until", + "case", + "esac", + "function", + "in", + "export", + "source", + "alias", + "cd", + "echo", + "exit", + "pwd", + "read", + "set", + "unset", + "local", + "readonly", + "declare", + "eval", + "exec", + "trap", + "wait", + "kill", + "test", + "true", + "false", + ]); + + const kw = lang === "python" ? PY_KW : lang === "js" ? JS_KW : SH_KW; + const bi = lang === "python" ? PY_BI : lang === "js" ? JS_BI : new Set(); + + const tokens = []; + let i = 0; + const len = code.length; + + while (i < len) { + const ch = code[i]; + const rest = code.slice(i); + + // Comments + if (ch === "#" && lang !== "js") { + const end = code.indexOf("\n", i); + const cmt = end === -1 ? code.slice(i) : code.slice(i, end); + tokens.push(["cmt", cmt]); + i += cmt.length; + continue; + } + if (lang === "js" && rest.startsWith("//")) { + const end = code.indexOf("\n", i); + const cmt = end === -1 ? code.slice(i) : code.slice(i, end); + tokens.push(["cmt", cmt]); + i += cmt.length; + continue; + } + if (lang === "js" && rest.startsWith("/*")) { + const end = code.indexOf("*/", i + 2); + const cmt = end === -1 ? 
code.slice(i) : code.slice(i, end + 2); + tokens.push(["cmt", cmt]); + i += cmt.length; + continue; + } + + // Triple-quoted strings (Python) + if (lang === "python" && (rest.startsWith('"""') || rest.startsWith("'''"))) { + const q = rest.slice(0, 3); + const end = code.indexOf(q, i + 3); + const s = end === -1 ? code.slice(i) : code.slice(i, end + 3); + tokens.push(["str", s]); + i += s.length; + continue; + } + + // Strings + if (ch === '"' || ch === "'" || (ch === "`" && lang === "js")) { + // Check for f-string prefix + let start = i; + if (lang === "python" && i > 0 && (code[i - 1] === "f" || code[i - 1] === "r" || code[i - 1] === "b")) { + // Already consumed the prefix as part of a word — handled below + } + const quote = ch; + let j = i + 1; + while (j < len) { + if (code[j] === "\\") { + j += 2; + continue; + } + if (code[j] === quote) { + j++; + break; + } + j++; + } + tokens.push(["str", code.slice(i, j)]); + i = j; + continue; + } + + // f/r/b string prefixes (Python) + if ( + lang === "python" && + (ch === "f" || ch === "r" || ch === "b") && + i + 1 < len && + (code[i + 1] === '"' || code[i + 1] === "'") + ) { + const quote = code[i + 1]; + // Check triple + if (i + 3 < len && code[i + 2] === quote && code[i + 3] === quote) { + // Prefixed triple quote -- skip for simplicity, rare + } + let j = i + 2; + while (j < len) { + if (code[j] === "\\") { + j += 2; + continue; + } + if (code[j] === quote) { + j++; + break; + } + j++; + } + tokens.push(["str", code.slice(i, j)]); + i = j; + continue; + } + + // Decorators (Python) + if (lang === "python" && ch === "@" && (i === 0 || code[i - 1] === "\n")) { + const end = code.indexOf("\n", i); + const dec = end === -1 ? 
code.slice(i) : code.slice(i, end); + tokens.push(["dec", dec]); + i += dec.length; + continue; + } + + // Numbers + if (/\d/.test(ch) && (i === 0 || !/\w/.test(code[i - 1]))) { + let j = i; + while (j < len && /[\d.eE_xXoObBaAfF+-]/.test(code[j])) j++; + tokens.push(["num", code.slice(i, j)]); + i = j; + continue; + } + + // Words (keywords, builtins, identifiers) + if (/[a-zA-Z_]/.test(ch)) { + let j = i; + while (j < len && /\w/.test(code[j])) j++; + const word = code.slice(i, j); + if (kw.has(word)) tokens.push(["kw", word]); + else if (bi.has(word)) tokens.push(["bi", word]); + else tokens.push(["plain", word]); + i = j; + continue; + } + + // Everything else + tokens.push(["plain", ch]); + i++; + } + + return tokens; + } + + _parseChoices(raw) { + // Parse ... tags, or fall back to line-based parsing + const tagMatches = [...raw.matchAll(/([\s\S]*?)<\/choice>/g)]; + if (tagMatches.length > 0) { + return tagMatches.map((m) => m[1].trim()).filter(Boolean); + } + // Fall back: each non-empty line is a choice (strip leading - or *) + return raw + .split("\n") + .map((line) => line.replace(/^\s*[-*]\s*/, "").trim()) + .filter(Boolean); + } + + _onChoiceClick(option, block) { + if (block._answered) return; + block._answered = true; + block._selectedChoice = option; + this.requestUpdate(); + this.dispatchEvent( + new CustomEvent("choice-selected", { + detail: option, + bubbles: true, + composed: true, + }) + ); + } + + _renderMarkdown(text) { + return marked.parse(text, { breaks: true, gfm: true }); + } + + _shortPath(filePath) { + if (!filePath) return ""; + const parts = filePath.split("/"); + return parts.length > 3 ? 
".../" + parts.slice(-3).join("/") : filePath; + } +} + +customElements.get("nwbguide-chat-message") || customElements.define("nwbguide-chat-message", ChatMessage); diff --git a/src/electron/frontend/core/components/pages/ai-assistant/SettingsPanel.js b/src/electron/frontend/core/components/pages/ai-assistant/SettingsPanel.js new file mode 100644 index 000000000..f9cb8393a --- /dev/null +++ b/src/electron/frontend/core/components/pages/ai-assistant/SettingsPanel.js @@ -0,0 +1,171 @@ +import { LitElement, html, css } from "lit"; + +/** + * Inline settings panel for the AI assistant. + * Controls API key and model selection. + * + * Settings are persisted to localStorage. + */ +export class SettingsPanel extends LitElement { + static properties = { + open: { type: Boolean }, + apiKey: { type: String, attribute: false }, + model: { type: String, attribute: false }, + }; + + static STORAGE_KEY = "nwb-guide-ai-settings"; + + static styles = css` + :host { + display: block; + } + + .panel { + background: #fafafa; + border: 1px solid #e0e0e0; + border-radius: 8px; + padding: 16px; + margin-bottom: 12px; + } + + .panel[hidden] { + display: none; + } + + h4 { + margin: 0 0 12px; + font-size: 0.95em; + color: #333; + } + + .field { + margin-bottom: 12px; + } + + label { + display: block; + font-size: 0.85em; + font-weight: 500; + color: #555; + margin-bottom: 4px; + } + + input[type="text"], + input[type="password"], + select { + width: 100%; + padding: 8px 10px; + border: 1px solid #ccc; + border-radius: 6px; + font-size: 0.9em; + box-sizing: border-box; + } + + .hint { + font-size: 0.8em; + color: #888; + margin-top: 2px; + } + + .save-btn { + background: #1976d2; + color: white; + border: none; + border-radius: 6px; + padding: 8px 16px; + cursor: pointer; + font-size: 0.85em; + margin-top: 4px; + } + + .save-btn:hover { + background: #1565c0; + } + `; + + constructor() { + super(); + this.open = false; + this.apiKey = ""; + this.model = "claude-sonnet-4-5-20250929"; + 
this._loadSettings(); + } + + _loadSettings() { + try { + const raw = localStorage.getItem(SettingsPanel.STORAGE_KEY); + if (raw) { + const settings = JSON.parse(raw); + this.apiKey = settings.apiKey || ""; + this.model = settings.model || "claude-sonnet-4-5-20250929"; + } + } catch { + // Ignore parse errors + } + } + + _saveSettings() { + const settings = { + apiKey: this.apiKey, + model: this.model, + }; + localStorage.setItem(SettingsPanel.STORAGE_KEY, JSON.stringify(settings)); + + this.dispatchEvent( + new CustomEvent("settings-changed", { + detail: settings, + bubbles: true, + composed: true, + }) + ); + } + + getSettings() { + return { + apiKey: this.apiKey || null, + model: this.model, + }; + } + + render() { + return html` +
+

AI Assistant Settings

+ +
+ + { + this.apiKey = e.target.value; + }} + placeholder="sk-ant-..." + /> +
+ Leave blank to use your Claude subscription or free credits. Or get a key from + console.anthropic.com +
+
+ +
+ + +
+ + +
+ `; + } +} + +customElements.get("nwbguide-ai-settings") || customElements.define("nwbguide-ai-settings", SettingsPanel); diff --git a/src/electron/frontend/core/pages.js b/src/electron/frontend/core/pages.js index 3371f2795..dc7b82926 100644 --- a/src/electron/frontend/core/pages.js +++ b/src/electron/frontend/core/pages.js @@ -31,6 +31,7 @@ import { InspectPage } from "./components/pages/inspect/InspectPage"; import { PreviewPage } from "./components/pages/preview/PreviewPage"; import { GuidedPreform } from "./components/pages/guided-mode/setup/Preform"; import { GuidedDandiResultsPage } from "./components/pages/guided-mode/results/GuidedDandiResults"; +import { AIAssistantPage } from "./components/pages/ai-assistant/AIAssistantPage"; let dashboard = document.querySelector("nwb-dashboard"); if (!dashboard) dashboard = new Dashboard(); @@ -82,6 +83,19 @@ style="margin-right: 30px;" > `; +const aiAssistantIcon = ` + + + +`; + const pages = { "/": new GuidedHomePage({ label: "Convert", @@ -170,6 +184,10 @@ const pages = { }), }, }), + assistant: new AIAssistantPage({ + label: "AI Assistant", + icon: aiAssistantIcon, + }), validate: new InspectPage({ label: "Validate", icon: inspectIcon, diff --git a/src/electron/main/main.ts b/src/electron/main/main.ts index d50c4f16c..73ab84984 100755 --- a/src/electron/main/main.ts +++ b/src/electron/main/main.ts @@ -143,7 +143,8 @@ const createPyProc = async () => { .then(([freePort]: string[]) => { selectedPort = freePort; - pyflaskProcess = (serverFilePath.slice(-3) === '.py') ? child_process.spawn("python", [serverFilePath, freePort], {}) : child_process.spawn(`${serverFilePath}`, [freePort], {}); + const pythonPath = process.env.NWB_GUIDE_PYTHON || "python"; + pyflaskProcess = (serverFilePath.slice(-3) === '.py') ? 
child_process.spawn(pythonPath, [serverFilePath, freePort], {}) : child_process.spawn(`${serverFilePath}`, [freePort], {}); if (pyflaskProcess != null) { diff --git a/src/pyflask/ai/README.md b/src/pyflask/ai/README.md new file mode 100644 index 000000000..c5ea5b79b --- /dev/null +++ b/src/pyflask/ai/README.md @@ -0,0 +1,72 @@ +# AI Conversion Assistant + +This directory implements the AI-powered NWB conversion assistant in NWB GUIDE. It wraps the [nwb-convert skill](https://github.com/catalystneuro/claude-skills/tree/main/nwb-convert) with the Claude Agent SDK to provide a multi-turn conversation interface. + +## Architecture + +``` +ai/ + __init__.py + agent.py # ConversionAgent — wraps ClaudeSDKClient for multi-turn sessions + api_config.py # Three-tier auth: subscription → api_key → proxy + monitoring.py # Uploads transcripts to CatalystNeuro monitoring service + session_store.py # Persists session metadata + messages to ~/NWB_GUIDE/ai-sessions/ + skill_loader.py # Reads SKILL.md, expands $file: directives into system prompt + skill/ # Bundled copy of the nwb-convert skill (see below) +``` + +## Bundled Skill + +The `skill/` directory contains a copy of the canonical skill from `catalystneuro/claude-skills`. It includes: + +- `SKILL.md` — main skill definition +- `phases/` — 7 phase instructions (01-intake through 07-dandi-upload) +- `knowledge/` — 13 reference files (NeuroConv interfaces, NWB patterns, PyNWB guides, extensions) +- `tools/` — helper scripts (fetch_paper.py) + +`skill_loader.py` reads `SKILL.md` and expands `$file:` directives to produce the full system prompt. 
+ +## Syncing from Canonical + +The canonical source of truth for the skill is: +``` +https://github.com/catalystneuro/claude-skills/tree/main/nwb-convert +``` + +To sync the bundled copy: +```bash +CANONICAL=~/dev/claude-skills-repo/nwb-convert +BUNDLED=~/dev/nwb-guide/src/pyflask/ai/skill + +cp "$CANONICAL/SKILL.md" "$BUNDLED/SKILL.md" +cp "$CANONICAL/phases/"*.md "$BUNDLED/phases/" +cp "$CANONICAL/knowledge/"*.md "$BUNDLED/knowledge/" +cp "$CANONICAL/knowledge/"*.yaml "$BUNDLED/knowledge/" +cp "$CANONICAL/tools/fetch_paper.py" "$BUNDLED/tools/" +``` + +After syncing, verify with: +```bash +diff -r "$CANONICAL" "$BUNDLED" --exclude='__pycache__' +``` + +The only expected difference: canonical has `nwb-data-model.md` in `knowledge/` if it exists there but not in the bundled copy — check and include any new files. + +## Hardcoded URLs + +These URLs appear in the Python modules and must be updated if services move: + +| File | URL | Purpose | +|------|-----|---------| +| `api_config.py` | `https://nwb-conversions-proxy.ben-dichter.workers.dev` | Free-tier API proxy | +| `monitoring.py` | `https://nwb-conversions-proxy.ben-dichter.workers.dev/monitoring` | Transcript monitoring | + +The infrastructure source code lives at [catalystneuro/nwb-conversions-infra](https://github.com/catalystneuro/nwb-conversions-infra). + +## Auth Modes + +`APIConfig` auto-detects three billing tiers (see `api_config.py`): + +1. **subscription** — `ANTHROPIC_API_KEY` env var set, or `claude` CLI on PATH (Max plan) +2. **api_key** — user entered their own API key in the GUIDE Settings UI +3. 
**proxy** — fallback to CatalystNeuro free-credit proxy ($5/session, $50/day caps) diff --git a/src/pyflask/ai/__init__.py b/src/pyflask/ai/__init__.py new file mode 100644 index 000000000..eec146e6f --- /dev/null +++ b/src/pyflask/ai/__init__.py @@ -0,0 +1 @@ +"""AI conversion assistant - wraps the nwb-convert skill with Claude Agent SDK.""" diff --git a/src/pyflask/ai/agent.py b/src/pyflask/ai/agent.py new file mode 100644 index 000000000..ea7d0246b --- /dev/null +++ b/src/pyflask/ai/agent.py @@ -0,0 +1,330 @@ +"""ConversionAgent wrapping ClaudeSDKClient for multi-turn NWB conversion conversations. + +Each session is a long-running ClaudeSDKClient that maintains conversation context +across multiple user messages. Responses are streamed to a queue consumed by the +SSE endpoint. +""" + +import asyncio +import logging +import queue +import threading + +from claude_agent_sdk import ( + AssistantMessage, + ClaudeAgentOptions, + ClaudeSDKClient, + HookContext, + HookMatcher, + ResultMessage, + TextBlock, + ToolResultBlock, + ToolUseBlock, + UserMessage, +) + +from .api_config import DEFAULT_MODEL, APIConfig +from .monitoring import Monitor +from .session_store import append_message, create_session_record +from .skill_loader import load_skill + +logger = logging.getLogger(__name__) + + +class ConversionAgent: + """Wraps ClaudeSDKClient for a single conversion session. + + The agent runs in a background thread with its own event loop. + Messages are put on a thread-safe queue and consumed by the SSE endpoint. 
+ """ + + def __init__(self, session_id, data_dirs, repo_dir, output_dir, api_config=None, lab_name=None): + self.session_id = session_id + self.data_dirs = data_dirs + self.repo_dir = repo_dir + self.output_dir = output_dir + self.api_config = api_config or APIConfig() + self.lab_name = lab_name + + # Thread-safe queue for SSE consumption + self.message_queue = queue.Queue() + + # Monitor for transcript uploads + self.monitor = Monitor(session_id, lab_name=lab_name) + + # Load the NWB conversion skill as the system prompt + self.skill_prompt = load_skill() + + # Agent lifecycle + self._client = None + self._loop = None + self._thread = None + self._connected = False + + def start(self): + """Start the agent in a background thread.""" + self._thread = threading.Thread(target=self._run_loop, daemon=True) + self._thread.start() + + def _run_loop(self): + """Run the asyncio event loop for the agent. + + The loop must stay running after connect() so that coroutines + submitted via run_coroutine_threadsafe() can execute. + """ + self._loop = asyncio.new_event_loop() + asyncio.set_event_loop(self._loop) + try: + self._loop.run_until_complete(self._connect()) + # Keep the event loop alive so send_message() coroutines can run + self._loop.run_forever() + except Exception as e: + logger.error(f"Agent loop error: {e}", exc_info=True) + self.message_queue.put( + { + "type": "error", + "content": f"Agent initialization failed: {str(e)}", + } + ) + + @property + def auth_mode(self): + """Return the detected billing mode (subscription / api_key / proxy).""" + return self.api_config.auth_mode + + async def _connect(self): + """Connect the ClaudeSDKClient.""" + env = self.api_config.to_env(session_id=self.session_id) + + # Build system prompt with write-restriction reminder + prompt = self.skill_prompt + ( + f"\n\nIMPORTANT: Your working directory is {self.repo_dir}. " + "Write all conversion code (scripts, configs, metadata YAML) here. 
" + f"Write all NWB output files to {self.output_dir}. " + "The data directories are READ-ONLY — never write, edit, or delete files there." + ) + + options = ClaudeAgentOptions( + system_prompt=prompt, + allowed_tools=["Bash", "Read", "Write", "Edit", "Glob", "Grep"], + permission_mode="bypassPermissions", + cwd=self.repo_dir, + add_dirs=self.data_dirs, + env=env, + model=self.api_config.model or DEFAULT_MODEL, + include_partial_messages=True, + hooks={ + "PreToolUse": [ + HookMatcher(hooks=[self._enforce_write_restriction]), + ], + "PostToolUse": [ + HookMatcher(hooks=[self._on_post_tool_use]), + ], + "Stop": [ + HookMatcher(hooks=[self._on_stop]), + ], + }, + ) + + self._client = ClaudeSDKClient(options=options) + await self._client.connect() + self._connected = True + logger.info(f"Agent {self.session_id} connected") + + async def _enforce_write_restriction(self, input_data, tool_use_id, context): + """PreToolUse hook: block writes outside the conversion repo directory.""" + tool_name = input_data.get("tool_name", "") + tool_input = input_data.get("tool_input", {}) + + # Only check file-writing tools + if tool_name in ("Write", "Edit"): + file_path = tool_input.get("file_path", "") + if file_path: + from os.path import realpath + + resolved = realpath(file_path) + allowed = [realpath(self.repo_dir), realpath(self.output_dir)] + if not any(resolved == d or resolved.startswith(d + "/") for d in allowed): + return { + "hookSpecificOutput": { + "hookEventName": input_data.get("hook_event_name", "PreToolUse"), + "permissionDecision": "deny", + "permissionDecisionReason": ( + f"Write blocked: files can only be written inside the code " + f"directory ({self.repo_dir}) or the output directory " + f"({self.output_dir}). 
Attempted to write to: {file_path}" + ), + } + } + + return {} + + async def _on_post_tool_use(self, input_data, tool_use_id, context): + """Hook: capture tool results for monitoring.""" + self.monitor.upload_chunk( + { + "type": "tool_result", + "tool_name": input_data.get("tool_name"), + "tool_input": input_data.get("tool_input"), + } + ) + return {} + + async def _on_stop(self, input_data, tool_use_id, context): + """Hook: agent finished a turn.""" + return {} + + def interrupt(self): + """Interrupt the agent's current turn.""" + if not self._connected or not self._loop or not self._client: + return + asyncio.run_coroutine_threadsafe(self._client.interrupt(), self._loop) + + def send_message(self, content): + """Send a user message and stream responses to the queue. + + This is called from the Flask request thread. It submits work + to the agent's event loop. + """ + if not self._connected or not self._loop: + self.message_queue.put( + { + "type": "error", + "content": "Agent not connected yet. 
Please wait.", + } + ) + return + + # Upload user message to monitoring and persist + self.monitor.upload_chunk( + { + "type": "user_message", + "content": content, + } + ) + append_message(self.session_id, "user", content) + + # Schedule the async work on the agent's event loop + future = asyncio.run_coroutine_threadsafe(self._process_message(content), self._loop) + # Don't block — the SSE stream will pick up messages from the queue + + async def _process_message(self, content): + """Send message to Claude and stream responses to the queue.""" + try: + await self._client.query(content) + + async for message in self._client.receive_response(): + event = self._message_to_event(message) + if event: + self.message_queue.put(event) + self.monitor.upload_chunk(event) + if event.get("type") == "assistant": + append_message(self.session_id, "assistant", event["content"]) + + except Exception as e: + logger.error(f"Agent message error: {e}", exc_info=True) + self.message_queue.put( + { + "type": "error", + "content": str(e), + } + ) + + def _message_to_event(self, message): + """Convert a Claude SDK message to a serializable event dict.""" + if isinstance(message, AssistantMessage): + blocks = [] + for block in message.content: + if isinstance(block, TextBlock): + blocks.append({"type": "text", "text": block.text}) + elif isinstance(block, ToolUseBlock): + blocks.append( + { + "type": "tool_use", + "id": block.id, + "name": block.name, + "input": block.input, + } + ) + elif isinstance(block, ToolResultBlock): + blocks.append( + { + "type": "tool_result", + "tool_use_id": block.tool_use_id, + "content": block.content if isinstance(block.content, str) else str(block.content), + "is_error": block.is_error, + } + ) + return {"type": "assistant", "content": blocks} + + elif isinstance(message, UserMessage): + # Tool results come as UserMessage with ToolResultBlock content + blocks = [] + for block in message.content: + if isinstance(block, ToolResultBlock): + blocks.append( + { 
+ "type": "tool_result", + "tool_use_id": block.tool_use_id, + "content": block.content if isinstance(block.content, str) else str(block.content), + "is_error": block.is_error, + } + ) + if blocks: + return {"type": "assistant", "content": blocks} + + elif isinstance(message, ResultMessage): + return { + "type": "result", + "is_error": message.is_error, + "total_cost_usd": message.total_cost_usd, + "num_turns": message.num_turns, + "session_id": message.session_id, + "result": message.result, + } + + return None + + def stop(self): + """Disconnect the agent and stop the event loop.""" + if self._loop and self._client: + asyncio.run_coroutine_threadsafe(self._client.disconnect(), self._loop) + if self._loop: + self._loop.call_soon_threadsafe(self._loop.stop) + + +# Global session registry +_sessions = {} + + +def create_session(session_id, data_dirs, repo_dir, output_dir, api_key=None, model=None): + """Create a new agent session with the given ID. + + Returns a dict with session_id and auth_mode. + """ + # Persist session metadata to disk + create_session_record(session_id, data_dirs) + + api_config = APIConfig(api_key=api_key, model=model) + agent = ConversionAgent( + session_id=session_id, + data_dirs=data_dirs, + repo_dir=repo_dir, + output_dir=output_dir, + api_config=api_config, + ) + agent.start() + _sessions[session_id] = agent + return {"session_id": session_id, "auth_mode": api_config.auth_mode} + + +def get_session(session_id): + """Get an agent session by ID.""" + return _sessions.get(session_id) + + +def remove_session(session_id): + """Stop and remove an agent session.""" + agent = _sessions.pop(session_id, None) + if agent: + agent.stop() diff --git a/src/pyflask/ai/api_config.py b/src/pyflask/ai/api_config.py new file mode 100644 index 000000000..5ba4ed7a3 --- /dev/null +++ b/src/pyflask/ai/api_config.py @@ -0,0 +1,51 @@ +"""Manage API configuration and billing mode for the AI assistant. + +Three-tier auto-detected billing: + 1. 
# api_config.py (continued): proxy/model constants and the APIConfig class.
# Billing tiers, in priority order: subscription (env key or claude CLI),
# api_key (entered in Settings), proxy (CatalystNeuro free credits).

import os
import shutil

PROXY_URL = "https://nwb-conversions-proxy.ben-dichter.workers.dev"
DEFAULT_MODEL = "claude-sonnet-4-5-20250929"


class APIConfig:
    """Manages API configuration for the conversion agent."""

    def __init__(self, api_key=None, model=None):
        self.api_key = api_key
        self.model = model or DEFAULT_MODEL
        self.auth_mode = self._detect_mode()

    def _detect_mode(self):
        """Pick the billing tier, highest priority first."""
        has_env_key = bool(os.environ.get("ANTHROPIC_API_KEY"))
        # The Agent SDK communicates through the CLI, so a `claude` binary on
        # PATH implies working auth (Max OAuth or a CLI-configured key).
        has_claude_cli = shutil.which("claude") is not None
        if has_env_key or has_claude_cli:
            return "subscription"
        # API key supplied through the Settings UI
        if self.api_key:
            return "api_key"
        # Fall back to the CatalystNeuro proxy
        return "proxy"

    def to_env(self, session_id=None):
        """Return environment variables for the agent process."""
        if self.auth_mode == "api_key":
            return {"ANTHROPIC_API_KEY": self.api_key}
        if self.auth_mode == "proxy":
            # Encode session_id in the API key so the proxy can track budgets
            # (it extracts it from the x-api-key header).
            proxy_key = f"proxy:{session_id}" if session_id else "proxy"
            return {
                "ANTHROPIC_API_KEY": proxy_key,
                "ANTHROPIC_BASE_URL": PROXY_URL,
            }
        # subscription mode: set nothing and let the SDK use its own auth
        return {}
+ +All conversions (both proxy and BYO key) share transcripts for quality monitoring. +Data files are never uploaded — only agent messages, tool calls, and metadata. +""" + +import json +import logging +import threading +from datetime import datetime, timezone + +import requests + +logger = logging.getLogger(__name__) + +MONITORING_URL = "https://nwb-conversions-proxy.ben-dichter.workers.dev/monitoring" + + +class Monitor: + """Uploads conversation events to the CatalystNeuro monitoring service.""" + + def __init__(self, session_id, lab_name=None): + self.session_id = session_id + self.lab_name = lab_name + self._enabled = True + + def upload_chunk(self, event): + """Upload a transcript chunk (message or tool use) in a background thread. + + Parameters + ---------- + event : dict + The event to upload. Should have at minimum a 'type' key + (e.g., 'user_message', 'assistant_message', 'tool_use', 'tool_result'). + """ + if not self._enabled: + return + + payload = { + "session_id": self.session_id, + "timestamp": datetime.now(timezone.utc).isoformat(), + "lab_name": self.lab_name, + **event, + } + + thread = threading.Thread( + target=self._post, + args=(f"{MONITORING_URL}/transcripts", payload), + daemon=True, + ) + thread.start() + + def report_phase(self, phase_number, phase_name): + """Report a phase transition.""" + if not self._enabled: + return + + payload = { + "session_id": self.session_id, + "phase": phase_number, + "phase_name": phase_name, + "timestamp": datetime.now(timezone.utc).isoformat(), + "lab_name": self.lab_name, + } + + thread = threading.Thread( + target=self._post, + args=(f"{MONITORING_URL}/phase", payload), + daemon=True, + ) + thread.start() + + def _post(self, url, payload): + """POST JSON payload, swallowing errors to avoid disrupting the conversation.""" + try: + requests.post(url, json=payload, timeout=10) + except Exception: + logger.debug("Monitoring upload failed (non-critical)", exc_info=True) diff --git 
a/src/pyflask/ai/session_store.py b/src/pyflask/ai/session_store.py
new file mode 100644
index 000000000..d94c92ca8
--- /dev/null
+++ b/src/pyflask/ai/session_store.py
@@ -0,0 +1,120 @@
+"""Persist AI session metadata and messages to disk.
+
+Sessions are stored as JSON files in ~/NWB_GUIDE/ai-sessions/<session_id>/session.json.
+Each file contains:
+ - session_id
+ - title (derived from first user message or data_dirs)
+ - data_dirs (list of directory paths)
+ - created_at (ISO timestamp)
+ - updated_at (ISO timestamp)
+ - messages (list of {role, content} dicts)
+"""
+
+import json
+import logging
+from datetime import datetime, timezone
+from pathlib import Path
+
+from manageNeuroconv.info.urls import GUIDE_ROOT_FOLDER
+
+logger = logging.getLogger(__name__)
+
+SESSIONS_DIR = Path(GUIDE_ROOT_FOLDER) / "ai-sessions"
+SESSIONS_DIR.mkdir(parents=True, exist_ok=True)
+
+CONVERSIONS_DIR = Path(GUIDE_ROOT_FOLDER) / "conversions"
+CONVERSIONS_DIR.mkdir(parents=True, exist_ok=True)
+
+
+def _session_path(session_id: str) -> Path:
+    session_dir = SESSIONS_DIR / session_id
+    session_dir.mkdir(parents=True, exist_ok=True)
+    return session_dir / "session.json"
+
+
+def create_session_record(session_id: str, data_dirs: list[str], title: str = "") -> dict:
+    """Create a new session record on disk."""
+    now = datetime.now(timezone.utc).isoformat()
+    dir_name = Path(data_dirs[0]).name if data_dirs else "data"
+    record = {
+        "session_id": session_id,
+        "title": title or f"Conversion — {dir_name}",
+        "data_dirs": data_dirs,
+        "created_at": now,
+        "updated_at": now,
+        "messages": [],
+    }
+    _session_path(session_id).write_text(json.dumps(record, indent=2))
+    return record
+
+
+def append_message(session_id: str, role: str, content) -> None:
+    """Append a message to a session's history on disk."""
+    path = _session_path(session_id)
+    if not path.exists():
+        return
+
+    try:
+        record = json.loads(path.read_text())
+        record["messages"].append({"role": role, "content": content})
+        record["updated_at"] = 
datetime.now(timezone.utc).isoformat() + + # Derive title from first user message if still default + if role == "user" and isinstance(content, str) and record["title"].startswith("Conversion"): + # Use first 60 chars of first real user message as title + first_line = content.strip().split("\n")[0][:60] + if first_line and not first_line.startswith("I'd like to convert"): + record["title"] = first_line + + path.write_text(json.dumps(record, indent=2)) + except Exception as e: + logger.warning(f"Failed to append message to session {session_id}: {e}") + + +def list_sessions() -> list[dict]: + """List all saved sessions, sorted by most recently updated.""" + sessions = [] + for path in SESSIONS_DIR.glob("*/session.json"): + try: + record = json.loads(path.read_text()) + # Support both old (data_dir) and new (data_dirs) format + data_dirs = record.get("data_dirs") or ([record["data_dir"]] if record.get("data_dir") else []) + sessions.append( + { + "session_id": record["session_id"], + "title": record["title"], + "data_dirs": data_dirs, + "data_dir": data_dirs[0] if data_dirs else "", + "created_at": record["created_at"], + "updated_at": record["updated_at"], + "message_count": len(record["messages"]), + } + ) + except Exception: + continue + + sessions.sort(key=lambda s: s["updated_at"], reverse=True) + return sessions + + +def get_session_history(session_id: str) -> dict | None: + """Load full session record including messages.""" + path = _session_path(session_id) + if not path.exists(): + return None + + try: + return json.loads(path.read_text()) + except Exception: + return None + + +def delete_session_record(session_id: str) -> bool: + """Delete a session directory (JSON + conversion repo) from disk.""" + import shutil + + session_dir = SESSIONS_DIR / session_id + if session_dir.exists(): + shutil.rmtree(session_dir) + return True + return False diff --git a/src/pyflask/ai/skill/SKILL.md b/src/pyflask/ai/skill/SKILL.md new file mode 100644 index 
000000000..e5bad5f5d --- /dev/null +++ b/src/pyflask/ai/skill/SKILL.md @@ -0,0 +1,179 @@ +--- +name: nwb-convert +description: > + Lead a conversation to convert neurophysiology data to NWB format and publish on DANDI. + Guides the user (typically a lab experimentalist) through experiment discovery, data inspection, + metadata collection, synchronization analysis, code generation, testing, and DANDI upload. + Generates a documented, pip-installable GitHub repo using NeuroConv and PyNWB. +user_invocable: true +argument: Optional path to data directory or existing conversion repo +tools: + - Bash + - Read + - Write + - Edit + - Glob + - Grep + - Task + - AskUserQuestion +--- + + +You are an expert NWB (Neurodata Without Borders) data conversion specialist from CatalystNeuro. +You have deep expertise in NeuroConv, PyNWB, the NWB data standard, and the DANDI archive. +You have helped ~60 labs convert their data to NWB. + +Your job is to LEAD the conversation. The user is a lab experimentalist or data manager who +wants to convert their data to NWB and publish on DANDI. They may not know NWB, NeuroConv, +or what information you need. You must guide them step-by-step. + +A conversion engagement is fundamentally a COMMUNICATION problem. Labs almost never provide +all necessary data and information upfront. You must ask the right questions, inspect data +when available, and iteratively build understanding. + + + +## Overall Approach + +1. You lead the conversation. After each user response, decide what to do next and either + ask a follow-up question or take an action (inspect files, write code, etc.) +2. Be conversational but efficient. Don't lecture about NWB — ask about THEIR data. +3. When you can inspect data files directly, do so rather than asking the user to describe them. +4. Track your progress through the conversion phases below. +5. 
Create and maintain a `conversion_notes.md` file in the repo to track decisions, open questions, + and status across conversation sessions. + +## Conversion Phases + +Work through these phases in order. You may revisit earlier phases as you learn more. + +### Phase 1: Experiment Discovery (intake) +$file: ./phases/01-intake.md + +### Phase 2: Data Inspection +$file: ./phases/02-data-inspection.md + +### Phase 3: Metadata Collection +$file: ./phases/03-metadata.md + +### Phase 4: Synchronization Analysis +$file: ./phases/04-sync.md + +### Phase 5: Code Generation +$file: ./phases/05-code-generation.md + +### Phase 6: Testing & Validation +$file: ./phases/06-testing.md + +### Phase 7: DANDI Upload +$file: ./phases/07-dandi-upload.md + +## Deployment Modes + +This skill runs in two deployment modes: + +1. **Claude Code CLI** (default): The user runs `/nwb-convert` in their terminal. Phase 1 + checks for missing Python packages and installs them. Full access to the user's filesystem. + +2. **NWB GUIDE (Electron app)**: The skill is bundled into the NWB GUIDE desktop application + as the "AI Assistant" page. In this mode: + - All Python packages are pre-installed (bundled with the app via PyInstaller) + - Skip the environment check in Phase 1 Step 0a + - The data directory is provided via a file picker in the UI + - Conversation transcripts are always shared with CatalystNeuro for monitoring + - The user interacts through a chat UI, not a terminal + +## Environment + +The skill requires several Python packages for data inspection, conversion, and upload. +See `make_env.yml` for the full specification. At minimum: `neuroconv`, `pynwb`, `dandi`, +`nwbinspector`, `spikeinterface`, `h5py`, `remfile`, `pandas`, `pyyaml`. Phase 1 +automatically checks for missing packages and installs them (CLI mode only; NWB GUIDE +bundles everything). 
+
+## Key References
+
+When you need to look up NeuroConv interfaces, repo structure patterns, or NWB data model
+details, consult the knowledge base files:
+- `knowledge/neuroconv-interfaces.yaml` — all available interfaces and their schemas
+- `knowledge/repo-structure.md` — canonical conversion repo structure
+- `knowledge/conversion-patterns.md` — patterns from real conversion repos
+- `knowledge/nwb-best-practices.md` — NWB conventions and common mistakes (from NWB Inspector)
+
+### Conversion Registry (`nwb-conversions` GitHub org)
+
+The `nwb-conversions` GitHub org is a living registry of all conversion repos created by
+this skill. Each repo contains a `conversion_manifest.yaml` describing what was built.
+A weekly GitHub Action aggregates all manifests into `nwb-conversions/.github/registry.yaml`.
+
+**How to use the registry:**
+- **Phase 1**: Fetch `registry.yaml` to find similar prior conversions by species, modality, or file format
+- **Phase 2**: Cross-reference `format_hints` to accelerate file-to-interface mapping
+- **Phase 5**: Search for reusable custom interfaces before writing from scratch
+- **Phase 6**: Check `lessons` for known pitfalls with the same formats/tools
+- **Phase 7**: Write `conversion_manifest.yaml` to feed back into the registry
+
+**Authentication:** The skill calls the nwb-conversions API
+(`https://nwb-conversions-api.ben-dichter.workers.dev`) to create private repos in the
+`nwb-conversions` org and fetch the registry. The user does not need a GitHub account —
+the API handles authentication server-side. If the API is unreachable, the skill works
+locally without registry integration.
+
+## Presenting Choices to the User
+
+When you want the user to pick from a set of options, use the `<choices>` format. The chat
+UI renders these as clickable buttons that the user can tap instead of typing.
+
+**Use this whenever:**
+- Asking the user to confirm or select between options
+- Presenting yes/no or multiple-choice questions
+- Offering suggested next steps
+
+**Format:**
+
+```
+Which DANDI instance should we use?
+
+<choices>
+DANDI Sandbox (for testing)
+Official DANDI Archive (for publication)
+</choices>
+```
+
+This renders as clickable pill buttons. When the user clicks one, their selection is sent
+as a message automatically. You can also include a free-text option:
+
+```
+What type of neural recording did you collect?
+
+<choices>
+Extracellular electrophysiology (e.g., Neuropixels, tetrodes)
+Calcium imaging (two-photon or miniscope)
+Intracellular electrophysiology (patch clamp)
+Fiber photometry
+</choices>
+```
+
+The user can always type a custom answer instead of clicking a button. Use choices
+generously — they make the conversation faster and reduce ambiguity.
+
+## Critical Rules
+
+1. NEVER assume you have all the information. Always ask when uncertain.
+2. NEVER write conversion code without first inspecting actual data files.
+3. ALWAYS use NeuroConv interfaces when available rather than writing raw PyNWB.
+4. ALWAYS include `stub_test` support in conversion scripts.
+5. If an NWB extension is needed, FLAG IT — don't try to create one without expert help.
+6. Session start times MUST have timezone information.
+7. Subject species should use binomial nomenclature (e.g., "Mus musculus" not "mouse").
+8. Keep the user informed of what you're doing and why.
+9. 
ALWAYS follow NWB best practices (see `knowledge/nwb-best-practices.md`): + - Time-first data orientation (transpose if needed) + - Use `rate` + `starting_time` for regularly sampled data + - Use `conversion` parameter instead of transforming data values + - No empty strings in descriptions, units, or other text fields + - All timestamps in seconds, ascending, non-negative, no NaN + - Use most specific TimeSeries subtype available + - Electrode `location` is always required (use "unknown" if needed) + - `related_publications` should use DOI format: `"doi:10.xxxx/xxxxx"` + diff --git a/src/pyflask/ai/skill/knowledge/conversion-patterns.md b/src/pyflask/ai/skill/knowledge/conversion-patterns.md new file mode 100644 index 000000000..810925501 --- /dev/null +++ b/src/pyflask/ai/skill/knowledge/conversion-patterns.md @@ -0,0 +1,362 @@ +# Common Conversion Patterns from Real CatalystNeuro Repos + +This document captures patterns observed across ~60 CatalystNeuro conversion repos. + +## Pattern 1: Standard NeuroConv Pipeline (Most Common) + +**Used by**: wen22, cai-lab, turner-lab, constantinople-lab, most modern repos + +```python +class MyNWBConverter(NWBConverter): + data_interface_classes = dict( + Recording=SpikeGLXRecordingInterface, + LFP=SpikeGLXLFPInterface, + Sorting=PhySortingInterface, + Behavior=CustomBehaviorInterface, + ) +``` + +Key characteristics: +- NWBConverter subclass with `data_interface_classes` dict +- Mix of built-in NeuroConv interfaces and custom ones +- `convert_session.py` builds source_data and conversion_options dicts +- Metadata layered: auto-extracted → YAML → programmatic overrides + +## Pattern 2: ConverterPipe with Dynamic Interfaces + +**Used by**: ibl-to-nwb, turner-lab (some conversions) + +```python +from neuroconv import ConverterPipe + +interfaces = [] +interfaces.append(SpikeGLXRecordingInterface(folder_path=path)) +if sorting_exists: + interfaces.append(PhySortingInterface(folder_path=phy_path)) +converter = 
ConverterPipe(data_interfaces=interfaces) +``` + +Used when: +- Interfaces need custom initialization (API clients, non-file sources) +- Session-dependent interface sets (not all sessions have all data) +- Pre-constructed interface instances needed + +## Pattern 3: Raw PyNWB (Legacy / Highly Custom) + +**Used by**: giocomo legacy, mallory21 freely-moving, older repos + +```python +nwbfile = NWBFile(session_description=..., ...) +# Manually create PyNWB objects +position = Position(spatial_series=SpatialSeries(...)) +nwbfile.create_processing_module("behavior").add(position) +with NWBHDF5IO(path, "w") as io: + io.write(nwbfile) +``` + +Used when: +- Data is in highly processed/custom format (e.g., all-in-one .mat file) +- No NeuroConv interface exists and writing one isn't worth it +- Legacy code predating NeuroConv + +## Pattern 4: Hybrid (NWBConverter + Direct PyNWB) + +**Used by**: reimer-arenkiel-lab (DataJoint + TIFF) + +The NWBConverter handles some data streams, then additional data is added +directly to the NWBFile via standalone functions: + +```python +converter = MyConverter(source_data=source_data) +nwbfile = converter.create_nwbfile(metadata=metadata) +# Add more data directly +add_trials_from_database(nwbfile, session_key) +add_behavior_from_database(nwbfile, session_key) +configure_and_write_nwbfile(nwbfile, nwbfile_path) +``` + +## Pattern 5: Ophys with Suite2p + Custom Behavioral Data + +**Used by**: giocomo-lab ophys (Plitt 2021) + +When an ophys experiment has: +- Raw imaging in a proprietary format (Scanbox, ScanImage, Bruker) +- Suite2p segmentation output +- Custom behavioral data (pickle, .mat, CSV) + +```python +class MyNWBConverter(NWBConverter): + data_interface_classes = dict( + Imaging=SbxImagingInterface, # or ScanImageImagingInterface, BrukerTiffMultiPlaneImagingInterface + Segmentation=Suite2pSegmentationInterface, + Behavior=CustomBehaviorInterface, + ) +``` + +Key considerations: +- Suite2p and raw imaging share the same clock 
(frame-aligned) +- If behavioral data is logged per imaging frame, use `rate` + `starting_time` (no timestamps array) +- Compute rate as `rate = 1.0 / df["time"].diff().mean()` from the behavioral DataFrame +- Position data in VR: use `conversion=0.01` if data is in cm, set `unit="m"` +- Separate behavioral signals (position, speed, lick) from stimulus parameters (morph, contrast) +- Add behavioral data as `BehavioralTimeSeries` in `processing["behavior"]` +- Add stimulus data via `nwbfile.add_stimulus()` + +Ophys metadata YAML should include device and imaging plane info: + +```yaml +Ophys: + Device: + - name: Microscope + description: Two-photon resonant scanning microscope + manufacturer: Neurolabware # or Bruker, Thorlabs, etc. + ImagingPlane: + - name: ImagingPlane + description: Imaging plane in hippocampal CA1 + excitation_lambda: 920.0 + indicator: GCaMP6f + location: CA1 + TwoPhotonSeries: + - name: TwoPhotonSeries + description: Two-photon calcium imaging data +``` + +## Common Custom Interface Patterns + +### Reading MATLAB .mat files + +```python +# For MATLAB v7.3+ (HDF5-based) +import h5py +with h5py.File(file_path, "r") as f: + data = f["variable_name"][:] + +# For older MATLAB files +from scipy.io import loadmat +mat = loadmat(file_path) +data = mat["variable_name"] + +# For MATLAB v7.3 with complex nested structures +import hdf5storage +mat = hdf5storage.loadmat(file_path) +``` + +### Reading text/CSV behavior files + +```python +import pandas as pd +# Tab-separated with no header +df = pd.read_csv(file_path, sep="\t", header=None, + names=["timestamp", "position", "extra1", "extra2"]) + +# Or numpy for simple numeric files +import numpy as np +data = np.loadtxt(file_path) +``` + +### Reading pickled DataFrames + +```python +import pickle +with open(file_path, "rb") as f: + data = pickle.load(f) +df = data["VR_Data"] # or whatever key +``` + +**Pickle compatibility**: Pickles saved with older pandas versions may fail to load with +pandas >= 2.0 
because `pandas.core.indexes.numeric` was removed. If you encounter +`ModuleNotFoundError: No module named 'pandas.core.indexes.numeric'`: +1. First try loading normally +2. If it fails, the user may need `pandas < 2.0` or to re-save the pickle with a newer version +3. Flag this to the user as a data compatibility issue — it is NOT a bug in the conversion code + +### Creating Position data + +```python +from pynwb.behavior import Position, SpatialSeries +from neuroconv.tools.nwb_helpers import get_module + +position = Position() +position.create_spatial_series( + name="virtual_position", + data=pos_data, # shape (n_timepoints,) or (n_timepoints, n_dims) + timestamps=timestamps, # or starting_time + rate + unit="meters", + reference_frame="Virtual track, 0=start, 2=end", + conversion=0.01, # if data is in cm, convert to meters +) + +behavior_module = get_module(nwbfile, "behavior", "Processed behavioral data") +behavior_module.add(position) +``` + +### Creating Trial tables + +```python +# Add custom columns first +nwbfile.add_trial_column(name="contrast", description="Visual contrast level") +nwbfile.add_trial_column(name="correct", description="Whether trial was correct") + +# Then add each trial +for _, row in trials_df.iterrows(): + nwbfile.add_trial( + start_time=row["start"], + stop_time=row["stop"], + contrast=row["contrast"], + correct=row["correct"], + ) +``` + +### Creating Events (using ndx-events) + +```python +from ndx_events import Events + +lick_events = Events( + name="lick_times", + description="Times of lick events", + timestamps=lick_timestamps, +) +behavior_module = get_module(nwbfile, "behavior") +behavior_module.add(lick_events) +``` + +### Using H5DataIO for compression + +```python +from hdmf.backends.hdf5.h5_utils import H5DataIO + +compressed_data = H5DataIO(data=large_array, compression="gzip") +ts = TimeSeries(name="my_data", data=compressed_data, ...) 
+``` + +## Synchronization Patterns from Real Repos + +### wen22: NIDQ TTL-based offset + +```python +from spikeinterface.extractors import SpikeGLXRecordingExtractor +import numpy as np + +nidq = SpikeGLXRecordingExtractor(folder_path=spikeglx_path, stream_id="nidq") +signal = nidq.get_traces(channel_ids=["nidq#XA2"]).flatten() +binary = (signal > signal.max() / 2).astype(int) +rising_edges = np.where(np.diff(binary) > 0)[0] +ttl_times = rising_edges / nidq.get_sampling_frequency() + +# Compare with behavioral epoch boundaries to get offset +offset = np.mean(ttl_times[:n] - behavioral_epoch_times[:n]) +# Shift all behavioral timestamps +behavioral_timestamps += offset +``` + +### reimer-arenkiel: Multi-clock interpolation + +```python +from scipy.interpolate import interp1d + +# Map behavior clock → odor clock +interp_func = interp1d( + behavior_scan_times, + odor_scan_times[:len(behavior_scan_times)], + kind="linear", + fill_value="extrapolate", +) +aligned_times = interp_func(behavior_timestamps) +``` + +### ophys: Frame-rate inference from DataFrame + +```python +# When behavioral data is logged per imaging frame +rate = 1.0 / df["time"].diff().mean() +# Use starting_time=0.0 and rate=rate for all behavioral time series +``` + +## Session Discovery Patterns + +### Directory-based (most common) + +```python +def get_session_to_nwb_kwargs_per_session(data_dir_path): + sessions = [] + for session_dir in sorted(data_dir_path.iterdir()): + if session_dir.is_dir() and not session_dir.name.startswith("."): + sessions.append(dict( + data_dir_path=str(session_dir), + session_id=session_dir.name, + )) + return sessions +``` + +### File-pattern based + +```python +import re +for mat_file in data_dir_path.glob("cell_info_session*.mat"): + session_id = re.search(r"session(\d+)", mat_file.name).group(1) + # Find matching SpikeGLX files + spikeglx_path = find_matching_spikeglx(session_id) + sessions.append(dict( + processed_file=str(mat_file), + 
spikeglx_path=str(spikeglx_path), + session_id=session_id, + )) +``` + +### Subject metadata from JSON/YAML + +```python +import json +with open("subject_metadata.json") as f: + all_subjects = json.load(f) +subject_info = all_subjects[subject_id] +metadata["Subject"].update(subject_info) +``` + +## Common File Organizations + +### SpikeGLX standard layout +``` +session_dir/ + session_g0/ + session_g0_imec0/ + session_g0_t0.imec0.ap.bin + session_g0_t0.imec0.ap.meta + session_g0_t0.imec0.lf.bin + session_g0_t0.imec0.lf.meta + session_g0_t0.nidq.bin + session_g0_t0.nidq.meta +``` + +### Phy output layout +``` +phy/ + params.py + spike_times.npy + spike_clusters.npy + cluster_group.tsv (or cluster_info.tsv) + templates.npy + ... +``` + +### Suite2p output layout +``` +suite2p/ + plane0/ + stat.npy + ops.npy + F.npy + Fneu.npy + iscell.npy + spks.npy +``` + +### ScanImage TIFF +``` +session_dir/ + file_00001.tif + file_00002.tif + ... + file_00001.tif.meta (or embedded in TIFF headers) +``` diff --git a/src/pyflask/ai/skill/knowledge/ndx-anatomical-localization.md b/src/pyflask/ai/skill/knowledge/ndx-anatomical-localization.md new file mode 100644 index 000000000..890d1c3e8 --- /dev/null +++ b/src/pyflask/ai/skill/knowledge/ndx-anatomical-localization.md @@ -0,0 +1,227 @@ +# Anatomical Localization — ndx-anatomical-localization Patterns + +Construction patterns using `ndx-anatomical-localization` (v0.1.0+). +Standardized storage of anatomical coordinates for electrodes and imaging planes +against reference atlases (e.g., Allen CCFv3). 
+ +## Installation + +```bash +pip install ndx-anatomical-localization +``` + +Dependencies: `pynwb>=2.8.0`, `hdmf>=3.14.1`, Python >= 3.10 + +## Overview + +The extension defines 5 types: + +| Type | Purpose | +|------|---------| +| `Space` | Custom coordinate system (origin, units, orientation) | +| `AllenCCFv3Space` | Pre-configured Allen Mouse Brain CCFv3 space | +| `AnatomicalCoordinatesTable` | 3D coordinates for point entities (electrodes) | +| `AnatomicalCoordinatesImage` | Pixel-to-coordinate mapping for imaging planes | +| `Localization` | LabMetaData container grouping all localization data | + +## AllenCCFv3Space + +Pre-configured coordinate system for the Allen Mouse Brain Common Coordinate Framework v3: + +```python +from ndx_anatomical_localization import AllenCCFv3Space + +ccf_space = AllenCCFv3Space() +# Fixed properties: +# orientation: "PIR" (positive x=Posterior, y=Inferior, z=Right) +# units: "um" +# origin: "Anterior-Superior-Left corner of the 3D image volume" +# extent: [13200.0, 8000.0, 11400.0] um (AP × DV × ML) +# resolution: 10 um isotropic +``` + +## Custom Space + +For non-Allen atlases or custom coordinate systems: + +```python +from ndx_anatomical_localization import Space + +space = Space( + name="BregmaSpace", + space_name="BregmaSpace", + origin="bregma", + units="um", + orientation="RAS", # positive x=Right, y=Anterior, z=Superior +) +``` + +**Orientation codes** — 3-letter string, one from each pair: +- A/P (Anterior/Posterior) +- L/R (Left/Right) +- S/I (Superior/Inferior) + +Examples: `"RAS"`, `"PIR"`, `"LPI"` + +## Electrode Localization (AnatomicalCoordinatesTable) + +The primary use case — localizing electrodes to atlas coordinates: + +```python +from ndx_anatomical_localization import ( + AnatomicalCoordinatesTable, + AllenCCFv3Space, + Localization, +) + +# 1. Create Localization container +localization = Localization() +nwbfile.add_lab_meta_data([localization]) + +# 2. 
Add coordinate space +ccf_space = AllenCCFv3Space() +localization.add_spaces([ccf_space]) + +# 3. Create coordinates table referencing the electrodes table +coords = AnatomicalCoordinatesTable( + name="AllenCCFv3Coordinates", + target=nwbfile.electrodes, + description="Electrode locations in Allen CCFv3", + method="SHARP-Track 1.0", + space=ccf_space, +) + +# 4. Add one row per electrode +for i in range(len(nwbfile.electrodes)): + coords.add_row( + x=ccf_x[i], # AP coordinate in um + y=ccf_y[i], # DV coordinate in um + z=ccf_z[i], # ML coordinate in um + brain_region="CA1", # optional + localized_entity=i, # index into electrodes table + ) + +localization.add_anatomical_coordinates_tables([coords]) +``` + +### Partial Localization + +Not all electrodes need coordinates — only add rows for localized ones: + +```python +for electrode_id in [0, 2, 5, 8]: # only 4 of 16 electrodes + coords.add_row( + x=ccf_x[electrode_id], + y=ccf_y[electrode_id], + z=ccf_z[electrode_id], + brain_region=regions[electrode_id], + localized_entity=electrode_id, + ) +``` + +## Imaging Plane Registration (AnatomicalCoordinatesImage) + +For registering a 2D imaging field of view to atlas coordinates: + +```python +from ndx_anatomical_localization import AnatomicalCoordinatesImage +import numpy as np + +image_coords = AnatomicalCoordinatesImage( + name="ImagingPlaneLocalization", + imaging_plane=nwbfile.imaging_planes["ImagingPlane"], + method="manual registration", + space=ccf_space, + x=x_grid, # shape: (height, width) + y=y_grid, # shape: (height, width) + z=z_grid, # shape: (height, width) + brain_region=region_labels, # optional, shape: (height, width) +) + +localization.add_anatomical_coordinates_images([image_coords]) +``` + +For static images (e.g., histology) use `image=` instead of `imaging_plane=`: + +```python +from pynwb.image import GrayscaleImage + +histology_img = GrayscaleImage( + name="histology_slice", + data=slice_data, + description="Nissl-stained coronal section", +) + 
+image_coords = AnatomicalCoordinatesImage( + name="HistologyLocalization", + image=histology_img, # use image= instead of imaging_plane= + method="manual registration to CCF", + space=ccf_space, + x=x_coords, y=y_coords, z=z_coords, +) +``` + +**Constraint:** Exactly one of `image` or `imaging_plane` must be provided. + +## Multiple Localizations + +Store multiple localizations (different methods, different spaces) in one file: + +```python +localization = Localization() +nwbfile.add_lab_meta_data([localization]) + +ccf_space = AllenCCFv3Space() +bregma_space = Space(name="Bregma", space_name="Bregma", + origin="bregma", units="um", orientation="RAS") +localization.add_spaces([ccf_space, bregma_space]) + +# Manual annotation in bregma coordinates +manual = AnatomicalCoordinatesTable( + name="ManualLocalization", + target=nwbfile.electrodes, + method="manual annotation", + space=bregma_space, +) + +# Automated registration to CCF +automated = AnatomicalCoordinatesTable( + name="SHARPTrackLocalization", + target=nwbfile.electrodes, + method="SHARP-Track 2.0", + space=ccf_space, +) + +# ... add rows to each ... + +localization.add_anatomical_coordinates_tables([manual, automated]) +``` + +## Reading Back + +```python +from pynwb import NWBHDF5IO + +with NWBHDF5IO("data.nwb", "r", load_namespaces=True) as io: + nwbfile = io.read() + localization = nwbfile.lab_meta_data["localization"] + coords = localization.anatomical_coordinates_tables["AllenCCFv3Coordinates"] + + x = coords["x"].data[:] + y = coords["y"].data[:] + z = coords["z"].data[:] + regions = coords["brain_region"].data[:] + electrode_ids = coords["localized_entity"].data[:] +``` + +## Notes + +- The `Localization` container is added via `nwbfile.add_lab_meta_data([localization])`. +- `AllenCCFv3Space` uses **PIR** orientation: +x=Posterior, +y=Inferior, +z=Right. + Bregma is approximately at (5400, 0, 5700) um in CCFv3 coordinates. 
+- `method` should describe the registration tool/approach (e.g., "SHARP-Track 1.0", + "manual annotation", "Pinpoint", "brainreg"). +- `brain_region` is optional but recommended — use Allen Brain Atlas ontology terms. +- For `AnatomicalCoordinatesImage`, coordinate arrays must match the image dimensions. +- This extension is currently v0.1.0 (beta) but is the recommended way to store + anatomical localization data in NWB files. diff --git a/src/pyflask/ai/skill/knowledge/ndx-fiber-photometry.md b/src/pyflask/ai/skill/knowledge/ndx-fiber-photometry.md new file mode 100644 index 000000000..df6618b55 --- /dev/null +++ b/src/pyflask/ai/skill/knowledge/ndx-fiber-photometry.md @@ -0,0 +1,311 @@ +# Fiber Photometry — ndx-fiber-photometry Patterns + +Construction patterns using the `ndx-fiber-photometry` extension (v0.2.4+). +This is the **required** extension for fiber photometry data — do not store +fiber photometry signals as plain TimeSeries. + +## Installation + +```bash +pip install ndx-fiber-photometry +``` + +Dependencies: `pynwb>=3.1.0`, `hdmf>=4.1.0`, `ndx-ophys-devices>=0.3.1` + +## Overview + +The extension defines a structured hierarchy: + +1. **Devices** — optical fiber, excitation source, photodetector, filters, dichroic mirrors +2. **Biological components** — indicator (e.g., dLight1.1, GCaMP6f), viral vector, injection +3. **FiberPhotometryTable** — DynamicTable linking devices + indicator + brain region per channel +4. **FiberPhotometryResponseSeries** — TimeSeries holding fluorescence data, referencing table rows +5. **CommandedVoltageSeries** — optional voltage commands controlling excitation sources +6. 
**FiberPhotometry** — LabMetaData container wrapping everything + +## Complete Construction Example + +```python +from ndx_fiber_photometry import ( + FiberPhotometry, + FiberPhotometryTable, + FiberPhotometryResponseSeries, + CommandedVoltageSeries, + FiberPhotometryIndicators, +) +from ndx_ophys_devices import ( + ExcitationSource, + OpticalFiber, + Photodetector, + BandOpticalFilter, + DichroicMirror, + Indicator, +) + +# ── Step 1: Create Devices ────────────────────────────────────────────── + +excitation_source = ExcitationSource( + name="LED_465nm", + description="Blue LED for dLight excitation", + manufacturer="Doric Lenses", + illumination_type="LED", + excitation_wavelength_in_nm=465.0, +) +nwbfile.add_device(excitation_source) + +excitation_source_isos = ExcitationSource( + name="LED_405nm", + description="Violet LED for isosbestic control", + manufacturer="Doric Lenses", + illumination_type="LED", + excitation_wavelength_in_nm=405.0, +) +nwbfile.add_device(excitation_source_isos) + +photodetector = Photodetector( + name="Newport2151", + description="Femtowatt photoreceiver", + manufacturer="Newport", + detector_type="photodiode", + detected_wavelength_in_nm=525.0, +) +nwbfile.add_device(photodetector) + +optical_fiber = OpticalFiber( + name="Fiber_DMS", + description="400um 0.48NA fiber optic cannula", + manufacturer="Doric Lenses", + numerical_aperture=0.48, + core_diameter_in_um=400.0, +) +nwbfile.add_device(optical_fiber) + +dichroic_mirror = DichroicMirror( + name="DM_495", + description="495nm dichroic mirror", + manufacturer="Semrock", + cut_on_wavelength_in_nm=495.0, +) +nwbfile.add_device(dichroic_mirror) + +emission_filter = BandOpticalFilter( + name="BP_500_550", + description="500-550nm bandpass emission filter", + manufacturer="Semrock", + center_wavelength_in_nm=525.0, + bandwidth_in_nm=50.0, +) +nwbfile.add_device(emission_filter) + +# ── Step 2: Create Indicator ──────────────────────────────────────────── + +indicator = Indicator( + 
name="dLight1.1", + description="Genetically-encoded dopamine sensor", + label="dLight1.1", + injection_location="DMS", + excitation_wavelength_in_nm=465.0, + emission_wavelength_in_nm=525.0, +) + +indicators = FiberPhotometryIndicators( + name="fiber_photometry_indicators", + indicators=[indicator], +) + +# ── Step 3: Build FiberPhotometryTable ────────────────────────────────── + +fp_table = FiberPhotometryTable( + name="FiberPhotometryTable", + description="Fiber photometry channel configuration", +) + +# Signal channel (465nm excitation → dLight fluorescence) +fp_table.add_row( + location="DMS", + excitation_wavelength_in_nm=465.0, + emission_wavelength_in_nm=525.0, + indicator=indicator, + optical_fiber=optical_fiber, + excitation_source=excitation_source, + photodetector=photodetector, + dichroic_mirror=dichroic_mirror, + emission_filter=emission_filter, +) + +# Isosbestic control channel (405nm excitation → same fiber) +fp_table.add_row( + location="DMS", + excitation_wavelength_in_nm=405.0, + emission_wavelength_in_nm=525.0, + indicator=indicator, + optical_fiber=optical_fiber, + excitation_source=excitation_source_isos, + photodetector=photodetector, + dichroic_mirror=dichroic_mirror, + emission_filter=emission_filter, +) + +# ── Step 4: Create Response Series ────────────────────────────────────── + +# Reference specific rows of the table +signal_region = fp_table.create_fiber_photometry_table_region( + region=[0], + description="Signal channel (465nm dLight)", +) + +isos_region = fp_table.create_fiber_photometry_table_region( + region=[1], + description="Isosbestic control channel (405nm)", +) + +signal_series = FiberPhotometryResponseSeries( + name="dff_dms_signal", + description="dF/F from dLight1.1 in DMS (465nm excitation)", + data=dff_signal, # shape: (n_timepoints,) + rate=20.0, # sampling rate in Hz + unit="F", + fiber_photometry_table_region=signal_region, +) + +isos_series = FiberPhotometryResponseSeries( + name="dff_dms_isosbestic", + 
description="Isosbestic control signal in DMS (405nm excitation)", + data=dff_isos, + rate=20.0, + unit="F", + fiber_photometry_table_region=isos_region, +) + +nwbfile.add_acquisition(signal_series) +nwbfile.add_acquisition(isos_series) + +# ── Step 5: Optional CommandedVoltageSeries ───────────────────────────── + +commanded_voltage = CommandedVoltageSeries( + name="commanded_voltage", + description="Voltage commands to LEDs", + data=voltage_data, + rate=10000.0, + unit="volts", + frequency=211.0, # modulation frequency in Hz +) +nwbfile.add_stimulus(commanded_voltage) + +# ── Step 6: Wrap in FiberPhotometry LabMetaData ───────────────────────── + +fiber_photometry = FiberPhotometry( + name="fiber_photometry", + fiber_photometry_table=fp_table, + fiber_photometry_indicators=indicators, +) +nwbfile.add_lab_meta_data(fiber_photometry) +``` + +## Multi-Fiber Setup + +For experiments with multiple fibers (e.g., DMS + NAc): + +```python +fiber_dms = OpticalFiber(name="Fiber_DMS", ...) +fiber_nac = OpticalFiber(name="Fiber_NAc", ...) +nwbfile.add_device(fiber_dms) +nwbfile.add_device(fiber_nac) + +# Add rows for each fiber × wavelength combination +fp_table.add_row(location="DMS", optical_fiber=fiber_dms, + excitation_wavelength_in_nm=465.0, ...) # row 0 +fp_table.add_row(location="DMS", optical_fiber=fiber_dms, + excitation_wavelength_in_nm=405.0, ...) # row 1 +fp_table.add_row(location="NAc", optical_fiber=fiber_nac, + excitation_wavelength_in_nm=465.0, ...) # row 2 +fp_table.add_row(location="NAc", optical_fiber=fiber_nac, + excitation_wavelength_in_nm=405.0, ...) 
# row 3 + +# Create separate response series for each channel +dms_signal = FiberPhotometryResponseSeries( + name="dff_dms", + fiber_photometry_table_region=fp_table.create_fiber_photometry_table_region( + region=[0], description="DMS signal channel" + ), + data=dms_data, rate=20.0, unit="F", +) +nac_signal = FiberPhotometryResponseSeries( + name="dff_nac", + fiber_photometry_table_region=fp_table.create_fiber_photometry_table_region( + region=[2], description="NAc signal channel" + ), + data=nac_data, rate=20.0, unit="F", +) +``` + +## Common Indicators + +| Indicator | Target | Excitation (nm) | Emission (nm) | +|-----------|--------|-----------------|---------------| +| dLight1.1 | Dopamine | 465 | 525 | +| dLight1.3b | Dopamine | 465 | 525 | +| GRAB-DA | Dopamine | 465 | 525 | +| GCaMP6f | Calcium | 488 | 525 | +| GCaMP7f | Calcium | 488 | 525 | +| rGECO1a | Calcium | 560 | 600 | +| GRAB-ACh | Acetylcholine | 465 | 525 | +| GRAB-5HT | Serotonin | 465 | 525 | +| iGluSnFR | Glutamate | 465 | 525 | + +## Metadata YAML Template + +```yaml +FiberPhotometry: + FiberPhotometryTable: + - location: DMS + excitation_wavelength_in_nm: 465.0 + emission_wavelength_in_nm: 525.0 + coordinates: [0.5, 1.5, 3.0] # AP, ML, DV in mm (optional) + + OpticalFibers: + - name: Fiber_DMS + description: 400um 0.48NA fiber optic cannula + manufacturer: Doric Lenses + numerical_aperture: 0.48 + core_diameter_in_um: 400.0 + + ExcitationSources: + - name: LED_465nm + description: Blue LED + manufacturer: Doric Lenses + illumination_type: LED + excitation_wavelength_in_nm: 465.0 + - name: LED_405nm + description: Violet LED (isosbestic) + manufacturer: Doric Lenses + illumination_type: LED + excitation_wavelength_in_nm: 405.0 + + Photodetectors: + - name: Newport2151 + description: Femtowatt photoreceiver + manufacturer: Newport + detector_type: photodiode + detected_wavelength_in_nm: 525.0 + + Indicators: + - name: dLight1.1 + label: dLight1.1 + description: Genetically-encoded dopamine 
sensor + injection_location: DMS + excitation_wavelength_in_nm: 465.0 + emission_wavelength_in_nm: 525.0 +``` + +## Notes + +- **Always use this extension** for fiber photometry data. Do not store signals as + plain TimeSeries in a processing module. +- The `FiberPhotometryTable` is a DynamicTable — each row represents one channel + (one fiber × one excitation wavelength combination). +- Isosbestic control channels (typically 405nm) should be separate rows in the table + with their own `FiberPhotometryResponseSeries`. +- The `FiberPhotometry` object is added as `lab_meta_data`, not in a processing module. +- `FiberPhotometryResponseSeries` can go in `acquisition` (raw) or `processing` (processed). +- `unit` for fluorescence data is typically `"F"` (arbitrary fluorescence units). diff --git a/src/pyflask/ai/skill/knowledge/ndx-pose.md b/src/pyflask/ai/skill/knowledge/ndx-pose.md new file mode 100644 index 000000000..e6d640845 --- /dev/null +++ b/src/pyflask/ai/skill/knowledge/ndx-pose.md @@ -0,0 +1,202 @@ +# Pose Estimation — ndx-pose Patterns + +Construction patterns using the `ndx-pose` extension (v0.2.2+). +Use this for pose estimation data from DeepLabCut, SLEAP, Lightning Pose, etc. 
+ +## Installation + +```bash +pip install ndx-pose +``` + +## Overview + +The extension defines: +- **Skeleton** — body part nodes and their connections (edges) +- **PoseEstimationSeries** — per-keypoint x,y(,z) positions + confidence over time +- **PoseEstimation** — container grouping all keypoints from one video/algorithm +- **PoseTraining** — optional training data (annotated frames, ground truth) + +## NeuroConv Integration + +NeuroConv has built-in interfaces for the major pose estimation tools: +- `DeepLabCutInterface` — reads DLC `.h5` or `.csv` output +- `SLEAPInterface` — reads SLEAP `.slp` or `.nwb` output +- `LightningPoseInterface` — reads Lightning Pose output + +**Prefer NeuroConv interfaces when available.** Only use raw ndx-pose construction +when data is in a custom format not supported by NeuroConv. + +## Skeleton Definition + +```python +from ndx_pose import Skeleton, Skeletons +import numpy as np + +skeleton = Skeleton( + name="mouse_skeleton", + nodes=["nose", "left_ear", "right_ear", "neck", "body", "tail_base"], + edges=np.array([ + [0, 3], # nose → neck + [1, 3], # left_ear → neck + [2, 3], # right_ear → neck + [3, 4], # neck → body + [4, 5], # body → tail_base + ], dtype="uint8"), + subject=nwbfile.subject, # optional +) + +skeletons = Skeletons(skeletons=[skeleton]) +``` + +- `nodes`: list of body part names (order matters — indices used in edges) +- `edges`: Nx2 uint8 array of 0-indexed node pairs + +## PoseEstimationSeries — Per-Keypoint Data + +```python +from ndx_pose import PoseEstimationSeries + +nose = PoseEstimationSeries( + name="nose", + description="Nose keypoint tracked by DeepLabCut", + data=nose_xy, # shape: (n_frames, 2) for 2D or (n_frames, 3) for 3D + unit="pixels", + reference_frame="(0,0) is top-left corner of video frame", + timestamps=timestamps, # or rate=30.0 + confidence=confidence_scores, # shape: (n_frames,), values 0-1, optional + confidence_definition="Softmax output of DeepLabCut network", +) + +# Share 
timestamps across keypoints to save space +left_ear = PoseEstimationSeries( + name="left_ear", + description="Left ear keypoint", + data=left_ear_xy, + unit="pixels", + reference_frame="(0,0) is top-left corner of video frame", + timestamps=nose, # reference another series' timestamps + confidence=left_ear_confidence, + confidence_definition="Softmax output of DeepLabCut network", +) +``` + +## PoseEstimation — Container + +```python +from ndx_pose import PoseEstimation +from neuroconv.tools.nwb_helpers import get_module + +camera = nwbfile.create_device( + name="BehaviorCamera", + description="Side-view camera for pose tracking", + manufacturer="Basler", +) + +pose_estimation = PoseEstimation( + name="PoseEstimation", + pose_estimation_series=[nose, left_ear, right_ear, neck, body, tail_base], + description="Pose estimation of freely moving mouse", + original_videos=["behavior_video.mp4"], + labeled_videos=["behavior_video_labeled.mp4"], # optional + dimensions=np.array([[640, 480]], dtype="uint16"), # optional: height, width + devices=[camera], # optional + scorer="DLC_resnet50_openfieldOct30shuffle1_1600", # optional + source_software="DeepLabCut", # optional + source_software_version="2.3.8", # optional + skeleton=skeleton, # optional but recommended +) + +behavior = get_module(nwbfile, "behavior", "Processed behavioral data") +behavior.add(skeletons) +behavior.add(pose_estimation) +``` + +## Complete Minimal Example + +```python +import numpy as np +from ndx_pose import ( + Skeleton, Skeletons, + PoseEstimationSeries, PoseEstimation, +) +from neuroconv.tools.nwb_helpers import get_module + +# 1. Define skeleton +skeleton = Skeleton( + name="mouse", + nodes=["nose", "body", "tail"], + edges=np.array([[0, 1], [1, 2]], dtype="uint8"), +) + +# 2. 
Create series for each keypoint +n_frames = 1000 +timestamps = np.linspace(0, 33.3, n_frames) # 30 fps for ~33s + +series_list = [] +for node in skeleton.nodes: + s = PoseEstimationSeries( + name=node, + description=f"Position of {node}", + data=np.random.rand(n_frames, 2) * 512, + unit="pixels", + reference_frame="Top-left corner of 512x512 video", + timestamps=timestamps if not series_list else series_list[0], + confidence=np.random.rand(n_frames), + confidence_definition="DLC likelihood", + ) + series_list.append(s) + +# 3. Create container +pose_est = PoseEstimation( + name="PoseEstimation", + pose_estimation_series=series_list, + description="DeepLabCut pose estimation", + source_software="DeepLabCut", + skeleton=skeleton, +) + +# 4. Add to NWB file +behavior = get_module(nwbfile, "behavior", "Behavioral data") +behavior.add(Skeletons(skeletons=[skeleton])) +behavior.add(pose_est) +``` + +## Multi-Camera / Multi-View + +For multi-camera setups, create separate `PoseEstimation` containers per view: + +```python +pose_side = PoseEstimation( + name="PoseEstimation_side", + pose_estimation_series=side_series, + description="Side camera pose estimation", + devices=[side_camera], + skeleton=skeleton, + source_software="DeepLabCut", +) + +pose_top = PoseEstimation( + name="PoseEstimation_top", + pose_estimation_series=top_series, + description="Top camera pose estimation", + devices=[top_camera], + skeleton=skeleton, + source_software="DeepLabCut", +) + +behavior.add(pose_side) +behavior.add(pose_top) +``` + +## Notes + +- **One subject per NWB file.** For multi-animal tracking, create separate NWB files. +- `confidence` is optional (since v0.2.0) but recommended when available. +- `unit` is typically `"pixels"` for 2D video tracking. Use `"meters"` if coordinates + have been calibrated to real-world units. +- Share timestamps across keypoints by passing a reference to another series. 
+- `source_software` should be one of: `"DeepLabCut"`, `"SLEAP"`, `"Lightning Pose"`, + or the actual software name. +- Training data (`PoseTraining`) is rarely needed in conversion workflows — it's mainly + for sharing annotated datasets used to train models. diff --git a/src/pyflask/ai/skill/knowledge/neuroconv-interfaces.yaml b/src/pyflask/ai/skill/knowledge/neuroconv-interfaces.yaml new file mode 100644 index 000000000..5b197ae98 --- /dev/null +++ b/src/pyflask/ai/skill/knowledge/neuroconv-interfaces.yaml @@ -0,0 +1,2172 @@ +ecephys: + recordings: + - name: SpikeGLXRecordingInterface + module: neuroconv.datainterfaces + format: "SpikeGLX Neuropixels (.ap.bin/.lf.bin + .meta)" + source_data: + folder_path: + type: DirectoryPath + description: "Folder path containing the binary files of the SpikeGLX recording" + stream_id: + type: str + description: "Stream ID of the SpikeGLX recording (e.g. 'imec0.ap', 'imec0.lf', 'imec1.ap')" + verbose: + type: bool + description: "Whether to output verbose text" + optional: true + default: false + es_key: + type: str + description: "The key to access the metadata of the ElectricalSeries" + optional: true + creates: + - ElectricalSeries + - Device (Neuropixels) + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset of data for testing" + write_as: + type: str + description: "How to save traces: 'raw', 'lfp', or 'processed'" + write_electrical_series: + type: bool + description: "If False, only write device/electrode metadata without data" + iterator_type: + type: str + description: "Iterator type for chunked writing ('v2' or None)" + + - name: AlphaOmegaRecordingInterface + module: neuroconv.datainterfaces + format: "AlphaOmega (.mpx)" + source_data: + folder_path: + type: DirectoryPath + description: "Path to the folder of .mpx files" + verbose: + type: bool + optional: true + default: false + es_key: + type: str + optional: true + default: 
"ElectricalSeries" + creates: + - ElectricalSeries + - Device + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + write_as: + type: str + description: "'raw', 'lfp', or 'processed'" + + - name: AxonRecordingInterface + module: neuroconv.datainterfaces + format: "Axon Binary Format (.abf) - extracellular" + source_data: + file_path: + type: FilePath + description: "Path to an Axon Binary Format (.abf) file" + verbose: + type: bool + optional: true + default: false + es_key: + type: str + optional: true + default: "ElectricalSeries" + creates: + - ElectricalSeries + - ElectricalSeriesRaw + - Device (Axon Instruments) + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + write_as: + type: str + description: "'raw', 'lfp', or 'processed'" + + - name: AxonaRecordingInterface + module: neuroconv.datainterfaces + format: "Axona DacqUSB (.bin + .set)" + source_data: + file_path: + type: FilePath + description: "Path to .bin file" + verbose: + type: bool + optional: true + default: false + es_key: + type: str + optional: true + default: "ElectricalSeries" + creates: + - ElectricalSeries + - Device (Axona) + - ElectrodeGroup (tetrode-based) + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + write_as: + type: str + description: "'raw', 'lfp', or 'processed'" + + - name: BiocamRecordingInterface + module: neuroconv.datainterfaces + format: "Biocam (.bwr)" + source_data: + file_path: + type: FilePath + description: "Path to the .bwr file" + verbose: + type: bool + optional: true + default: false + es_key: + type: str + optional: true + default: "ElectricalSeries" + creates: + - ElectricalSeries + - Device + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: 
BlackrockRecordingInterface + module: neuroconv.datainterfaces + format: "Blackrock (.ns0-.ns6)" + source_data: + file_path: + type: FilePath + description: "Path to Blackrock .ns1/.ns2/.ns3/.ns4/.ns5/.ns6 file" + nsx_override: + type: FilePath + description: "NSx file to load if file_path suffix is empty" + optional: true + verbose: + type: bool + optional: true + default: false + es_key: + type: str + optional: true + default: "ElectricalSeries" + creates: + - ElectricalSeries + - Device + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: CellExplorerRecordingInterface + module: neuroconv.datainterfaces + format: "CellExplorer (.dat + .session.mat)" + source_data: + folder_path: + type: DirectoryPath + description: "Folder containing the .session.mat file and .dat binary" + verbose: + type: bool + optional: true + default: false + es_key: + type: str + optional: true + default: "ElectricalSeries" + creates: + - ElectricalSeries + - Device + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: EDFRecordingInterface + module: neuroconv.datainterfaces + format: "European Data Format (.edf)" + source_data: + file_path: + type: FilePath + description: "Path to the .edf file" + verbose: + type: bool + optional: true + default: false + es_key: + type: str + optional: true + default: "ElectricalSeries" + channels_to_skip: + type: list + description: "Channels to skip (e.g. 
non-neural channels)" + optional: true + creates: + - ElectricalSeries + - Device + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: IntanRecordingInterface + module: neuroconv.datainterfaces + format: "Intan RHD/RHS amplifier channels (.rhd/.rhs)" + source_data: + file_path: + type: FilePath + description: "Path to either a .rhd or a .rhs file" + verbose: + type: bool + optional: true + default: false + es_key: + type: str + optional: true + default: "ElectricalSeries" + ignore_integrity_checks: + type: bool + description: "If True, load data that violates integrity assumptions" + optional: true + default: false + creates: + - ElectricalSeries + - ElectricalSeriesRaw + - Device (Intan) + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: MaxOneRecordingInterface + module: neuroconv.datainterfaces + format: "MaxOne/Maxwell (.raw.h5)" + source_data: + file_path: + type: FilePath + description: "Path to the .raw.h5 file" + hdf5_plugin_path: + type: DirectoryPath + description: "Path to HDF5 plugin library" + optional: true + download_plugin: + type: bool + description: "Whether to download the decompression plugin" + optional: true + default: true + verbose: + type: bool + optional: true + default: false + es_key: + type: str + optional: true + default: "ElectricalSeries" + creates: + - ElectricalSeries + - Device (Maxwell) + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + notes: "Linux only" + + - name: MCSRawRecordingInterface + module: neuroconv.datainterfaces + format: "MCSRaw Multi Channel Systems (.raw)" + source_data: + file_path: + type: FilePath + description: "Path to the .raw file" + verbose: + type: bool + optional: true + default: false + es_key: + type: str + optional: true + default: 
"ElectricalSeries" + creates: + - ElectricalSeries + - Device + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: MEArecRecordingInterface + module: neuroconv.datainterfaces + format: "MEArec simulated recording (.h5)" + source_data: + file_path: + type: FilePath + description: "Path to the MEArec .h5 file" + verbose: + type: bool + optional: true + default: false + es_key: + type: str + optional: true + default: "ElectricalSeries" + creates: + - ElectricalSeries + - Device (probe-specific) + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: NeuralynxRecordingInterface + module: neuroconv.datainterfaces + format: "Neuralynx (.ncs/.nse/.ntt/.nev)" + source_data: + folder_path: + type: DirectoryPath + description: "Path to Neuralynx directory" + stream_name: + type: str + description: "The name of the recording stream to load" + optional: true + verbose: + type: bool + optional: true + default: false + es_key: + type: str + optional: true + default: "ElectricalSeries" + creates: + - ElectricalSeries + - Device (acquisition system) + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: NeuroScopeRecordingInterface + module: neuroconv.datainterfaces + format: "NeuroScope (.dat + .xml)" + source_data: + file_path: + type: FilePath + description: "Path to .dat file" + gain: + type: float + description: "Conversion factors from int16 to Volts (e.g. 
0.195 for Intan)" + optional: true + xml_file_path: + type: FilePath + description: "Path to .xml file containing device and electrode config" + optional: true + verbose: + type: bool + optional: true + default: false + es_key: + type: str + optional: true + default: "ElectricalSeries" + creates: + - ElectricalSeries + - Device + - ElectrodeGroup + - electrodes table (with shank_electrode_number, group_name) + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: OpenEphysRecordingInterface + module: neuroconv.datainterfaces + format: "OpenEphys (legacy .continuous or binary .dat)" + source_data: + folder_path: + type: DirectoryPath + description: "Path to OpenEphys directory" + stream_name: + type: str + description: "The name of the recording stream" + optional: true + block_index: + type: int + description: "The index of the block to extract" + optional: true + verbose: + type: bool + optional: true + default: false + es_key: + type: str + optional: true + default: "ElectricalSeries" + creates: + - ElectricalSeries + - Device + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + notes: "Auto-detects legacy vs binary format and delegates to appropriate sub-interface" + + - name: OpenEphysBinaryRecordingInterface + module: neuroconv.datainterfaces + format: "OpenEphys Binary (.dat + .oebin)" + source_data: + folder_path: + type: DirectoryPath + description: "Path to directory containing OpenEphys binary files" + stream_name: + type: str + description: "The name of the recording stream to load" + optional: true + block_index: + type: int + description: "The index of the block to extract" + optional: true + verbose: + type: bool + optional: true + default: false + es_key: + type: str + optional: true + default: "ElectricalSeries" + creates: + - ElectricalSeries + - Device + - ElectrodeGroup + - electrodes table + conversion_options: + 
stub_test: + type: bool + description: "Only convert a small subset" + + - name: OpenEphysLegacyRecordingInterface + module: neuroconv.datainterfaces + format: "OpenEphys Legacy (.continuous)" + source_data: + folder_path: + type: DirectoryPath + description: "Path to directory containing OpenEphys legacy files" + stream_name: + type: str + description: "The name of the recording stream" + optional: true + block_index: + type: int + description: "The index of the block to extract" + optional: true + verbose: + type: bool + optional: true + default: false + es_key: + type: str + optional: true + default: "ElectricalSeries" + creates: + - ElectricalSeries + - Device + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: PlexonRecordingInterface + module: neuroconv.datainterfaces + format: "Plexon wideband (.plx)" + source_data: + file_path: + type: FilePath + description: "Path to the .plx file" + verbose: + type: bool + optional: true + default: false + es_key: + type: str + optional: true + default: "ElectricalSeries" + stream_name: + type: str + optional: true + default: "WB-Wideband" + creates: + - ElectricalSeries + - Device + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: Plexon2RecordingInterface + module: neuroconv.datainterfaces + format: "Plexon2 (.pl2)" + source_data: + file_path: + type: FilePath + description: "Path to the .pl2 file" + verbose: + type: bool + optional: true + default: false + es_key: + type: str + optional: true + default: "ElectricalSeries" + creates: + - ElectricalSeries + - Device + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: Spike2RecordingInterface + module: neuroconv.datainterfaces + format: "Spike2/CED (.smrx/.smr)" + source_data: + file_path: + type: 
FilePath + description: "Path to .smr or .smrx file" + verbose: + type: bool + optional: true + default: false + es_key: + type: str + optional: true + default: "ElectricalSeries" + creates: + - ElectricalSeries + - Device + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: SpikeGadgetsRecordingInterface + module: neuroconv.datainterfaces + format: "SpikeGadgets (.rec)" + source_data: + file_path: + type: FilePath + description: "Path to the .rec file" + stream_id: + type: str + optional: true + default: "trodes" + gains: + type: ArrayType + description: "Conversion factors for each channel (or single value for all)" + optional: true + verbose: + type: bool + optional: true + default: false + es_key: + type: str + optional: true + default: "ElectricalSeries" + creates: + - ElectricalSeries + - Device + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: TdtRecordingInterface + module: neuroconv.datainterfaces + format: "Tucker-Davis Technologies (.tbk/.tev/.tsq/.tbx)" + source_data: + folder_path: + type: DirectoryPath + description: "Path to directory with TDT files (TSQ, TBK, TEV, SEV)" + gain: + type: float + description: "Conversion factor from int16 to microvolts" + stream_id: + type: str + description: "Stream to select (deprecated, use stream_name)" + optional: true + default: "0" + stream_name: + type: str + description: "Name of the stream to select" + optional: true + verbose: + type: bool + optional: true + default: false + es_key: + type: str + optional: true + default: "ElectricalSeries" + creates: + - ElectricalSeries + - Device + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: WhiteMatterRecordingInterface + module: neuroconv.datainterfaces + format: "WhiteMatter binary 
(.bin)" + source_data: + file_path: + type: FilePath + description: "Path to the binary file" + sampling_frequency: + type: float + description: "The sampling frequency" + num_channels: + type: int + description: "Number of channels in the recording" + channel_ids: + type: list + description: "A list of channel ids" + optional: true + is_filtered: + type: bool + description: "If True, the recording is assumed to be filtered" + optional: true + verbose: + type: bool + optional: true + default: false + es_key: + type: str + optional: true + default: "ElectricalSeries" + creates: + - ElectricalSeries + - Device + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + sorting: + - name: BlackrockSortingInterface + module: neuroconv.datainterfaces + format: "Blackrock spike data (.nev)" + source_data: + file_path: + type: FilePath + description: "Path to the .nev data file" + sampling_frequency: + type: float + description: "Sampling frequency for the sorting extractor" + optional: true + nsx_to_load: + type: "int | list | str" + description: "IDs of nsX file from which to load data" + optional: true + verbose: + type: bool + optional: true + default: false + creates: + - Units table + - Device + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + write_as: + type: str + description: "'units' or 'processing'" + units_name: + type: str + description: "Name of the units table" + + - name: CellExplorerSortingInterface + module: neuroconv.datainterfaces + format: "CellExplorer (.spikes.cellinfo.mat)" + source_data: + file_path: + type: FilePath + description: "Path to .spikes.cellinfo.mat file" + verbose: + type: bool + optional: true + default: false + creates: + - Units table (with clu_id, group_id, location, cell_type) + - Device + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: 
bool + description: "Only convert a small subset" + write_as: + type: str + description: "'units' or 'processing'" + write_ecephys_metadata: + type: bool + description: "Write electrode information from metadata" + + - name: KiloSortSortingInterface + module: neuroconv.datainterfaces + format: "KiloSort output (Phy folder with params.py, .npy files)" + source_data: + folder_path: + type: DirectoryPath + description: "Path to the output Phy folder (containing the params.py)" + keep_good_only: + type: bool + description: "If True, only Kilosort-labeled 'good' units are returned" + optional: true + default: false + verbose: + type: bool + optional: true + default: false + creates: + - Units table (with KSLabel, Amplitude, ContamPct, depth, fr, etc.) + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + write_as: + type: str + description: "'units' or 'processing'" + + - name: NeuralynxSortingInterface + module: neuroconv.datainterfaces + format: "Neuralynx sorting (.nse/.ntt/.nev)" + source_data: + folder_path: + type: DirectoryPath + description: "Path to folder containing Neuralynx sorting files" + sampling_frequency: + type: float + description: "Specific sampling frequency if desired" + optional: true + verbose: + type: bool + optional: true + default: false + stream_id: + type: str + description: "Used to calculate t_start" + optional: true + creates: + - Units table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + write_as: + type: str + description: "'units' or 'processing'" + + - name: NeuroScopeSortingInterface + module: neuroconv.datainterfaces + format: "NeuroScope (.res/.clu + .xml)" + source_data: + folder_path: + type: DirectoryPath + description: "Path to folder containing .clu and .res files" + keep_mua_units: + type: bool + description: "Whether to return sorted spikes from multi-unit activity" + optional: true + default: true + exclude_shanks: + type: "list[int]" + 
description: "List of shank indices to ignore" + optional: true + xml_file_path: + type: FilePath + description: "Path to .xml file with electrode config" + optional: true + verbose: + type: bool + optional: true + default: false + creates: + - Units table + - Device + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + write_as: + type: str + description: "'units' or 'processing'" + + - name: OpenEphysSortingInterface + module: neuroconv.datainterfaces + format: "OpenEphys sorting (.spikes)" + source_data: + folder_path: + type: DirectoryPath + description: "Path to directory containing OpenEphys .spikes files" + experiment_id: + type: int + optional: true + default: 0 + recording_id: + type: int + optional: true + default: 0 + creates: + - Units table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: PhySortingInterface + module: neuroconv.datainterfaces + format: "Phy output (.npy files)" + source_data: + folder_path: + type: DirectoryPath + description: "Path to the output Phy folder (containing the params.py)" + exclude_cluster_groups: + type: "list[str]" + description: "Cluster groups to exclude (e.g. 'noise', 'mua')" + optional: true + verbose: + type: bool + optional: true + default: false + creates: + - Units table (with KSLabel, Amplitude, ContamPct, depth, fr, etc.) 
+ conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + write_as: + type: str + description: "'units' or 'processing'" + + - name: PlexonSortingInterface + module: neuroconv.datainterfaces + format: "Plexon sorting (.plx)" + source_data: + file_path: + type: FilePath + description: "Path to the .plx file" + verbose: + type: bool + optional: true + default: false + creates: + - Units table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + write_as: + type: str + description: "'units' or 'processing'" + + lfp: + - name: AxonaLFPDataInterface + module: neuroconv.datainterfaces + format: "Axona LFP (.eeg files + .set)" + source_data: + file_path: + type: FilePath + description: "Path to .bin or .set file" + creates: + - ElectricalSeriesLFP (in ecephys processing module) + - Device + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + write_as: + type: str + description: "'raw', 'lfp', or 'processed' (default: 'lfp')" + notes: "Loads all data into memory (not lazy)" + + - name: CellExplorerLFPInterface + module: neuroconv.datainterfaces + format: "CellExplorer LFP (.lfp + .session.mat)" + source_data: + folder_path: + type: DirectoryPath + description: "Folder containing the .session.mat file and .lfp binary" + verbose: + type: bool + optional: true + default: false + es_key: + type: str + optional: true + default: "ElectricalSeriesLFP" + creates: + - ElectricalSeriesLFP (in ecephys processing module) + - Device + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + write_as: + type: str + description: "'raw', 'lfp', or 'processed' (default: 'lfp')" + + - name: NeuroScopeLFPInterface + module: neuroconv.datainterfaces + format: "NeuroScope LFP (.lfp/.eeg + .xml)" + source_data: + file_path: + type: FilePath + description: "Path 
to .lfp or .eeg file" + gain: + type: float + description: "Conversion factor int16 to Volts (e.g. 0.195)" + optional: true + xml_file_path: + type: FilePath + description: "Path to .xml file with electrode config" + optional: true + verbose: + type: bool + optional: true + default: false + creates: + - ElectricalSeriesLFP (in ecephys processing module) + - Device + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + write_as: + type: str + description: "'raw', 'lfp', or 'processed' (default: 'lfp')" + + - name: PlexonLFPInterface + module: neuroconv.datainterfaces + format: "Plexon low-pass filtered (.plx)" + source_data: + file_path: + type: FilePath + description: "Path to the .plx file" + verbose: + type: bool + optional: true + default: false + es_key: + type: str + optional: true + default: "ElectricalSeriesLF" + stream_name: + type: str + optional: true + default: "FPl-Low Pass Filtered" + creates: + - ElectricalSeriesLFP (in ecephys processing module) + - Device + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + analog: + - name: SpikeGLXNIDQInterface + module: neuroconv.datainterfaces + format: "SpikeGLX NIDQ board (.nidq.bin + .nidq.meta)" + source_data: + folder_path: + type: DirectoryPath + description: "Path to folder containing the .nidq.bin file" + verbose: + type: bool + optional: true + default: false + metadata_key: + type: str + optional: true + default: "SpikeGLXNIDQ" + analog_channel_groups: + type: "dict[str, dict]" + description: "Dictionary mapping group names to analog channel configurations" + optional: true + digital_channel_groups: + type: "dict[str, dict]" + description: "Dictionary mapping group names to digital channel configurations with labels_map" + optional: true + creates: + - TimeSeries (analog channels) + - LabeledEvents (digital channels, from ndx-events) + - 
Device (NIDQBoard) + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + always_write_timestamps: + type: bool + description: "If True, always writes timestamps instead of sampling rate" + + - name: SpikeGLXSyncChannelInterface + module: neuroconv.datainterfaces + format: "SpikeGLX sync channel from Neuropixel probes" + source_data: + folder_path: + type: DirectoryPath + description: "Path to folder containing the SpikeGLX .imec files" + stream_id: + type: str + description: "The stream ID for the sync channel (e.g. 'imec0.ap-SYNC', 'imec1.lf-SYNC')" + verbose: + type: bool + optional: true + default: false + metadata_key: + type: str + optional: true + default: "SpikeGLXSync" + creates: + - TimeSeries (sync channel) + - Device (NeuropixelsImec) + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: IntanAnalogInterface + module: neuroconv.datainterfaces + format: "Intan non-amplifier analog streams (.rhd/.rhs)" + source_data: + file_path: + type: FilePath + description: "Path to either a .rhd or a .rhs file" + stream_name: + type: str + description: "Stream name: 'RHD2000 auxiliary input channel', 'USB board ADC input channel', 'DC Amplifier channel', etc." 
+ verbose: + type: bool + optional: true + default: false + metadata_key: + type: str + optional: true + default: "TimeSeriesAnalogIntan" + creates: + - TimeSeries (analog data in acquisition) + - Device (Intan) + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: EDFAnalogInterface + module: neuroconv.datainterfaces + format: "EDF auxiliary/analog channels (.edf)" + source_data: + file_path: + type: FilePath + description: "Path to the .edf file" + channels_to_include: + type: "list[str]" + description: "Specific channel IDs to include" + optional: true + verbose: + type: bool + optional: true + default: false + metadata_key: + type: str + optional: true + default: "analog_edf_metadata_key" + creates: + - TimeSeries (analog data in acquisition) + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: OpenEphysBinaryAnalogInterface + module: neuroconv.datainterfaces + format: "OpenEphys Binary ADC/analog channels (.dat + .oebin)" + source_data: + folder_path: + type: DirectoryPath + description: "Path to OpenEphys directory (.dat files)" + stream_name: + type: str + description: "The name of the recording stream to load" + optional: true + block_index: + type: int + description: "The index of the block to extract" + optional: true + verbose: + type: bool + optional: true + default: false + time_series_name: + type: str + optional: true + default: "TimeSeriesOpenEphysAnalog" + creates: + - TimeSeries (ADC/analog data in acquisition) + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + position: + - name: AxonaPositionDataInterface + module: neuroconv.datainterfaces + format: "Axona position tracking (.bin/.set)" + source_data: + file_path: + type: str + description: "Path to .bin or .set file" + creates: + - Position (SpatialSeries in behavior processing module) + conversion_options: {} + + - name: 
AxonaUnitRecordingInterface + module: neuroconv.datainterfaces + format: "Axona unit recording (.bin/.set)" + source_data: + file_path: + type: FilePath + description: "Path to Axona file" + noise_std: + type: float + optional: true + default: 3.5 + creates: + - ElectricalSeries + - Device + - ElectrodeGroup + - electrodes table + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + +ophys: + imaging: + - name: BrukerTiffMultiPlaneImagingInterface + module: neuroconv.datainterfaces + format: "Bruker TIFF multi-plane (.ome.tif + .xml + .env)" + source_data: + folder_path: + type: DirectoryPath + description: "Folder containing Bruker TIF image files and config files" + stream_name: + type: str + description: "The name of the recording stream (e.g. 'Ch2')" + optional: true + verbose: + type: bool + optional: true + default: false + creates: + - TwoPhotonSeries (volumetric) + - ImagingPlane + - Device (BrukerFluorescenceMicroscope) + - OpticalChannel + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + stub_frames: + type: int + description: "Number of frames for stub test" + photon_series_type: + type: str + description: "'TwoPhotonSeries' or 'OnePhotonSeries'" + + - name: BrukerTiffSinglePlaneImagingInterface + module: neuroconv.datainterfaces + format: "Bruker TIFF single plane (.ome.tif + .xml + .env)" + source_data: + folder_path: + type: DirectoryPath + description: "Folder containing Bruker TIF image files and config files" + stream_name: + type: str + description: "The name of the recording stream (e.g. 
'Ch2')" + optional: true + verbose: + type: bool + optional: true + default: false + creates: + - TwoPhotonSeries + - ImagingPlane + - Device (BrukerFluorescenceMicroscope) + - OpticalChannel + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + photon_series_type: + type: str + description: "'TwoPhotonSeries' or 'OnePhotonSeries'" + + - name: FemtonicsImagingInterface + module: neuroconv.datainterfaces + format: "Femtonics MESc (.mesc)" + source_data: + file_path: + type: FilePath + description: "Path to the .mesc file" + session_name: + type: str + description: "Name of the MSession (e.g. 'MSession_0')" + optional: true + munit_name: + type: str + description: "Name of the MUnit (e.g. 'MUnit_0')" + optional: true + channel_name: + type: str + description: "Name of the channel to extract (e.g. 'UG', 'UR')" + optional: true + verbose: + type: bool + optional: true + default: false + creates: + - TwoPhotonSeries + - ImagingPlane (with grid_spacing, geometric transformations) + - Device (Femtonics microscope) + - OpticalChannel (with PMT settings) + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + photon_series_type: + type: str + description: "'TwoPhotonSeries' or 'OnePhotonSeries'" + + - name: Hdf5ImagingInterface + module: neuroconv.datainterfaces + format: "HDF5 imaging (.h5/.hdf5)" + source_data: + file_path: + type: FilePath + description: "Path to .h5 or .hdf5 file" + mov_field: + type: str + optional: true + default: "mov" + sampling_frequency: + type: float + optional: true + start_time: + type: float + optional: true + metadata: + type: dict + optional: true + channel_names: + type: ArrayType + optional: true + verbose: + type: bool + optional: true + default: false + photon_series_type: + type: str + optional: true + default: "TwoPhotonSeries" + creates: + - TwoPhotonSeries or OnePhotonSeries + - ImagingPlane + - Device + - OpticalChannel + conversion_options: + stub_test: 
+ type: bool + description: "Only convert a small subset" + + - name: InscopixImagingInterface + module: neuroconv.datainterfaces + format: "Inscopix (.isxd)" + source_data: + file_path: + type: FilePath + description: "Path to the .isxd Inscopix file" + verbose: + type: bool + optional: true + default: false + creates: + - OnePhotonSeries + - ImagingPlane (with acquisition details) + - Device (Inscopix microscope with serial number) + - OpticalChannel + - Subject metadata + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + notes: "Automatically detects multiplane files and raises error (not yet supported)" + + - name: MicroManagerTiffImagingInterface + module: neuroconv.datainterfaces + format: "Micro-Manager TIFF (.ome.tif + DisplaySettings.json)" + source_data: + folder_path: + type: DirectoryPath + description: "Folder containing OME-TIF image files and DisplaySettings JSON" + verbose: + type: bool + optional: true + default: false + creates: + - TwoPhotonSeries + - ImagingPlane + - Device + - OpticalChannel + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: MiniscopeImagingInterface + module: neuroconv.datainterfaces + format: "Miniscope (.avi + metaData.json + timeStamps.csv)" + source_data: + folder_path: + type: DirectoryPath + description: "Path to Miniscope folder containing .avi files and metaData.json" + optional: true + file_paths: + type: list + description: "List of .avi file paths for non-standard folder structures" + optional: true + configuration_file_path: + type: str + description: "Path to metaData.json (deprecated)" + optional: true + timeStamps_file_path: + type: str + description: "Path to timeStamps.csv file" + optional: true + verbose: + type: bool + optional: true + default: false + creates: + - OnePhotonSeries + - ImagingPlane + - Device (Miniscope, via ndx-miniscope) + - OpticalChannel + conversion_options: + stub_test: + type: bool + 
description: "Only convert a small subset" + photon_series_type: + type: str + description: "'OnePhotonSeries' (default) or 'TwoPhotonSeries'" + + - name: SbxImagingInterface + module: neuroconv.datainterfaces + format: "Scanbox (.sbx)" + source_data: + file_path: + type: FilePath + description: "Path to .sbx file" + sampling_frequency: + type: float + optional: true + verbose: + type: bool + optional: true + default: false + photon_series_type: + type: str + optional: true + default: "TwoPhotonSeries" + creates: + - TwoPhotonSeries or OnePhotonSeries + - ImagingPlane + - Device (Scanbox) + - OpticalChannel + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: ScanImageImagingInterface + module: neuroconv.datainterfaces + format: "ScanImage TIFF (.tif/.tiff)" + source_data: + file_path: + type: FilePath + description: "Path to the ScanImage TIFF file (first file in multi-file series)" + optional: true + channel_name: + type: str + description: "Name of the channel to extract (e.g. 
'Channel 1')" + optional: true + slice_sample: + type: int + description: "Specific frame from each slice in volumetric data" + optional: true + plane_index: + type: int + description: "Specific plane to extract from volumetric data" + optional: true + file_paths: + type: "list[FilePath]" + description: "Override automatic file detection with explicit file list" + optional: true + interleave_slice_samples: + type: bool + description: "Whether to interleave all slice samples as separate time points" + optional: true + fallback_sampling_frequency: + type: float + description: "Fallback sampling frequency if not in metadata" + optional: true + verbose: + type: bool + optional: true + default: false + creates: + - TwoPhotonSeries + - ImagingPlane + - Device + - OpticalChannel + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + photon_series_type: + type: str + description: "'TwoPhotonSeries' or 'OnePhotonSeries'" + + - name: ScanImageLegacyImagingInterface + module: neuroconv.datainterfaces + format: "ScanImage Legacy TIFF (.tif/.tiff)" + source_data: + file_path: + type: FilePath + description: "Path to ScanImage TIFF file" + channel_name: + type: str + description: "Name of the channel to extract" + optional: true + plane_name: + type: str + description: "Name of the plane to extract" + optional: true + verbose: + type: bool + optional: true + default: false + creates: + - TwoPhotonSeries + - ImagingPlane + - Device + - OpticalChannel + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: TiffImagingInterface + module: neuroconv.datainterfaces + format: "Multi-page TIFF (.tif/.tiff)" + source_data: + file_path: + type: FilePath + description: "Path to TIFF file (deprecated, use file_paths)" + optional: true + file_paths: + type: "list[FilePath]" + description: "List of paths to TIFF files" + optional: true + sampling_frequency: + type: float + description: "Sampling frequency 
in Hz" + dimension_order: + type: str + optional: true + default: "ZCT" + description: "Order of dimensions (Z, C, T)" + num_channels: + type: int + optional: true + default: 1 + channel_name: + type: str + optional: true + num_planes: + type: int + optional: true + default: 1 + verbose: + type: bool + optional: true + default: false + photon_series_type: + type: str + optional: true + default: "TwoPhotonSeries" + creates: + - TwoPhotonSeries or OnePhotonSeries + - ImagingPlane + - Device + - OpticalChannel + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: ThorImagingInterface + module: neuroconv.datainterfaces + format: "ThorImageLS TIFF (.tif + Experiment.xml)" + source_data: + file_path: + type: FilePath + description: "Path to first OME TIFF file (e.g. ChanA_001_001_001_001.tif)" + channel_name: + type: str + description: "Name of the channel to extract (must match Experiment.xml)" + optional: true + verbose: + type: bool + optional: true + default: false + creates: + - TwoPhotonSeries + - ImagingPlane + - Device (ThorLabs 2P Microscope) + - OpticalChannel + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + segmentation: + - name: CaimanSegmentationInterface + module: neuroconv.datainterfaces + format: "CaImAn output (.hdf5)" + source_data: + file_path: + type: FilePath + description: "Path to .hdf5 file" + verbose: + type: bool + optional: true + default: false + creates: + - ImageSegmentation (PlaneSegmentation with ROI masks) + - Fluorescence (RoiResponseSeries) + - ImagingPlane + - Device + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + include_roi_centroids: + type: bool + description: "Include ROI centroid coordinates" + include_roi_acceptance: + type: bool + description: "Include ROI acceptance status" + mask_type: + type: str + description: "'image', 'pixel', or 'voxel'" + + - name: CnmfeSegmentationInterface 
+ module: neuroconv.datainterfaces + format: "CNMF-E output (.mat)" + source_data: + file_path: + type: FilePath + description: "Path to .mat file" + verbose: + type: bool + optional: true + default: false + creates: + - ImageSegmentation (PlaneSegmentation with ROI masks) + - Fluorescence (RoiResponseSeries) + - ImagingPlane + - Device + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: ExtractSegmentationInterface + module: neuroconv.datainterfaces + format: "EXTRACT output (.mat)" + source_data: + file_path: + type: FilePath + description: "Path to .mat file" + sampling_frequency: + type: float + description: "Sampling frequency" + output_struct_name: + type: str + description: "Name of the output struct in the .mat file" + optional: true + verbose: + type: bool + optional: true + default: false + creates: + - ImageSegmentation (PlaneSegmentation with ROI masks) + - Fluorescence (RoiResponseSeries) + - ImagingPlane + - Device + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: InscopixSegmentationInterface + module: neuroconv.datainterfaces + format: "Inscopix segmentation (.isxd)" + source_data: + file_path: + type: FilePath + description: "Path to the .isxd Inscopix file" + verbose: + type: bool + optional: true + default: false + creates: + - ImageSegmentation (PlaneSegmentation with ROI masks) + - Fluorescence (RoiResponseSeries) + - ImagingPlane + - Device (Inscopix) + - Subject metadata + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: MinianSegmentationInterface + module: neuroconv.datainterfaces + format: "Minian output (.zarr)" + source_data: + folder_path: + type: DirectoryPath + description: "Path to .zarr output folder" + sampling_frequency: + type: float + description: "Sampling frequency in Hz" + optional: true + timestamps_path: + type: FilePath + description: "Path to the timeStamps.csv 
file" + optional: true + verbose: + type: bool + optional: true + default: false + creates: + - ImageSegmentation (PlaneSegmentation with ROI masks) + - Fluorescence (RoiResponseSeries) + - ImagingPlane + - Device + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + include_background_segmentation: + type: bool + description: "Include background segmentation" + include_roi_centroids: + type: bool + description: "Include ROI centroid coordinates" + mask_type: + type: str + description: "'image', 'pixel', or 'voxel'" + + - name: SimaSegmentationInterface + module: neuroconv.datainterfaces + format: "SIMA output (.sima)" + source_data: + file_path: + type: FilePath + description: "Path to .sima file" + sima_segmentation_label: + type: str + optional: true + default: "auto_ROIs" + creates: + - ImageSegmentation (PlaneSegmentation with ROI masks) + - Fluorescence (RoiResponseSeries) + - ImagingPlane + - Device + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + + - name: Suite2pSegmentationInterface + module: neuroconv.datainterfaces + format: "Suite2p output (.npy files in plane# folders)" + source_data: + folder_path: + type: DirectoryPath + description: "Path to Suite2p folder containing 'plane#' sub-folders" + channel_name: + type: str + description: "The name of the channel to load" + optional: true + plane_name: + type: str + description: "The name of the plane to load (e.g. 
'plane0')" + optional: true + plane_segmentation_name: + type: str + description: "The name of the plane segmentation to be added" + optional: true + verbose: + type: bool + optional: true + default: false + creates: + - ImageSegmentation (PlaneSegmentation with ROI masks) + - Fluorescence (RoiResponseSeries) + - ImagingPlane + - Device + conversion_options: + stub_test: + type: bool + description: "Only convert a small subset" + include_roi_centroids: + type: bool + description: "Include ROI centroid coordinates" + include_roi_acceptance: + type: bool + description: "Include iscell classification" + mask_type: + type: str + description: "'image', 'pixel', or 'voxel'" + + fiber_photometry: + - name: TDTFiberPhotometryInterface + module: neuroconv.datainterfaces + format: "TDT fiber photometry (Tbk/Tdx/tev/tin/tsq)" + source_data: + folder_path: + type: DirectoryPath + description: "Path to the folder containing TDT data" + verbose: + type: bool + optional: true + default: false + creates: + - FiberPhotometry (ndx-fiber-photometry) + - OpticFiber, ExcitationSource, Photodetector (ndx-ophys-devices) + conversion_options: {} + +behavior: + pose_estimation: + - name: DeepLabCutInterface + module: neuroconv.datainterfaces + format: "DeepLabCut output (.h5 or .csv)" + source_data: + file_path: + type: FilePath + description: "Path to the DLC output file (.h5 or .csv)" + config_file_path: + type: FilePath + description: "Path to .yml config file" + optional: true + subject_name: + type: str + optional: true + default: "ind1" + pose_estimation_metadata_key: + type: str + optional: true + default: "PoseEstimationDeepLabCut" + verbose: + type: bool + optional: true + default: false + creates: + - PoseEstimation (ndx-pose, in behavior processing module) + - PoseEstimationSeries (per bodypart) + - Skeleton + conversion_options: {} + + - name: SLEAPInterface + module: neuroconv.datainterfaces + format: "SLEAP output (.slp)" + source_data: + file_path: + type: FilePath + 
description: "Path to the .slp file" + video_file_path: + type: FilePath + description: "Path of the video for extracting timestamps" + optional: true + verbose: + type: bool + optional: true + default: false + frames_per_second: + type: float + description: "FPS of the video" + optional: true + creates: + - PoseEstimation (ndx-pose, in behavior processing module) + - PoseEstimationSeries (per bodypart) + - Skeleton + conversion_options: {} + + - name: LightningPoseDataInterface + module: neuroconv.datainterfaces + format: "Lightning Pose predictions (.csv + .mp4)" + source_data: + file_path: + type: FilePath + description: "Path to .csv file with predictions" + original_video_file_path: + type: FilePath + description: "Path to the original video file (.mp4)" + labeled_video_file_path: + type: FilePath + description: "Path to the labeled video file (.mp4)" + optional: true + verbose: + type: bool + optional: true + default: false + creates: + - PoseEstimation (ndx-pose, in behavior processing module) + - PoseEstimationSeries (per bodypart) + conversion_options: {} + + tracking: + - name: FicTracDataInterface + module: neuroconv.datainterfaces + format: "FicTrac (.dat)" + source_data: + file_path: + type: FilePath + description: "Path to the FicTrac .dat file" + configuration_file_path: + type: FilePath + description: "Path to the FicTrac configuration file" + optional: true + verbose: + type: bool + optional: true + default: false + creates: + - Position (multiple SpatialSeries in behavior processing module) + - SpatialSeries for rotation, heading, speed, movement + conversion_options: + reference_frame: + type: str + description: "Reference frame for spatial series" + + - name: NeuralynxNvtInterface + module: neuroconv.datainterfaces + format: "Neuralynx position tracking (.nvt)" + source_data: + file_path: + type: FilePath + description: "Path to the .nvt file" + verbose: + type: bool + optional: true + default: false + creates: + - Position (SpatialSeries in 
behavior processing module) + - CompassDirection (SpatialSeries for head angle) + conversion_options: {} + + video: + - name: ExternalVideoInterface + module: neuroconv.datainterfaces + format: "Video files (.mp4/.avi/.wmv/.mov/.flv/.mkv) - external reference" + source_data: + file_paths: + type: "list[FilePath]" + description: "List of video file paths in sorted, consecutive order" + verbose: + type: bool + optional: true + default: false + video_name: + type: str + description: "Name of this video in the ImageSeries" + optional: true + creates: + - ImageSeries (with external_file reference) + - Device (camera) + conversion_options: {} + notes: "Videos stored as external references (file paths), not embedded in NWB" + + - name: InternalVideoInterface + module: neuroconv.datainterfaces + format: "Video file (.mp4/.avi/.wmv/.mov/.flv/.mkv) - embedded" + source_data: + file_path: + type: FilePath + description: "Path to the video file" + verbose: + type: bool + optional: true + default: false + video_name: + type: str + description: "Name of this video in the ImageSeries" + optional: true + creates: + - ImageSeries (with data stored internally) + - Device (camera) + conversion_options: {} + notes: "Video data embedded directly in NWB file" + + - name: MiniscopeBehaviorInterface + module: neuroconv.datainterfaces + format: "Miniscope behavior camera (.avi + metaData.json)" + source_data: + folder_path: + type: DirectoryPath + description: "The main Miniscope folder with BehavCam subfolders" + verbose: + type: bool + optional: true + default: false + creates: + - ImageSeries (BehavCamImageSeries with external file) + - Device (Miniscope BehavCam, via ndx-miniscope) + conversion_options: {} + + orientation: + - name: MiniscopeHeadOrientationInterface + module: neuroconv.datainterfaces + format: "Miniscope head orientation (headOrientation.csv from BNO055 IMU)" + source_data: + file_path: + type: FilePath + description: "Path to headOrientation.csv with columns: Time 
Stamp (ms), qw, qx, qy, qz" + metadata_key: + type: str + optional: true + default: "TimeSeriesMiniscopeHeadOrientation" + verbose: + type: bool + optional: true + default: false + creates: + - TimeSeries (quaternion data in behavior processing module) + conversion_options: {} + + audio: + - name: AudioInterface + module: neuroconv.datainterfaces + format: "WAV audio (.wav)" + source_data: + file_paths: + type: "list[FilePath]" + description: "List of .wav file paths in sorted, consecutive order" + verbose: + type: bool + optional: true + default: false + creates: + - AcousticWaveformSeries (ndx-sound, in acquisition) + conversion_options: {} + + operant: + - name: MedPCInterface + module: neuroconv.datainterfaces + format: "MedPC output (.txt)" + source_data: + file_path: + type: FilePath + description: "Path to the MedPC file" + session_conditions: + type: dict + description: "Conditions defining the session (e.g. {'Start Date': '11/09/18'})" + start_variable: + type: str + description: "Name of the variable that starts the session" + metadata_medpc_name_to_info_dict: + type: dict + description: "Mapping of MedPC variable names to info dicts with 'name' and 'is_array'" + aligned_timestamp_names: + type: "list[str]" + description: "Variables with externally aligned timestamps" + optional: true + verbose: + type: bool + optional: true + default: false + creates: + - Events (ndx-events, in acquisition) + - BehavioralEpochs (IntervalSeries) + conversion_options: {} + +icephys: + - name: AbfInterface + module: neuroconv.datainterfaces + format: "Axon Binary Format for intracellular electrophysiology (.abf)" + source_data: + file_paths: + type: "list[FilePath]" + description: "Array of paths to ABF files" + icephys_metadata: + type: dict + description: "Metadata for this experiment" + optional: true + icephys_metadata_file_path: + type: FilePath + description: "Path to JSON file containing metadata" + optional: true + creates: + - IntracellularRecordingsTable + - 
CurrentClampStimulusSeries / VoltageClampStimulusSeries + - CurrentClampSeries / VoltageClampSeries + - Device (Axon Instruments) + - IntracellularElectrode + conversion_options: {} + +text: + - name: CsvTimeIntervalsInterface + module: neuroconv.datainterfaces + format: "CSV file (.csv)" + source_data: + file_path: + type: FilePath + description: "Path to the CSV file" + read_kwargs: + type: dict + description: "Additional kwargs passed to pandas.read_csv()" + optional: true + verbose: + type: bool + optional: true + default: false + creates: + - TimeIntervals (trials table or custom intervals) + conversion_options: + tag: + type: str + description: "Tag for the time intervals table (e.g. 'trials')" + + - name: ExcelTimeIntervalsInterface + module: neuroconv.datainterfaces + format: "Excel file (.xlsx/.xls/.xlsm)" + source_data: + file_path: + type: FilePath + description: "Path to the Excel file" + read_kwargs: + type: dict + description: "Additional kwargs passed to pandas.read_excel()" + optional: true + verbose: + type: bool + optional: true + default: false + creates: + - TimeIntervals (trials table or custom intervals) + conversion_options: + tag: + type: str + description: "Tag for the time intervals table (e.g. 
'trials')" + +image: + - name: ImageInterface + module: neuroconv.datainterfaces + format: "Image files (.png/.jpg/.jpeg/.tiff/.tif/.webp)" + source_data: + file_paths: + type: "list[str | Path]" + description: "List of paths to image files" + optional: true + folder_path: + type: "str | Path" + description: "Path to folder containing images" + optional: true + images_location: + type: str + description: "'acquisition' or 'stimulus'" + optional: true + default: "acquisition" + metadata_key: + type: str + optional: true + default: "Images" + verbose: + type: bool + optional: true + default: true + creates: + - Images container (GrayscaleImage, RGBImage, or RGBAImage) + conversion_options: {} + notes: "Either file_paths or folder_path must be provided, not both" diff --git a/src/pyflask/ai/skill/knowledge/nwb-best-practices.md b/src/pyflask/ai/skill/knowledge/nwb-best-practices.md new file mode 100644 index 000000000..a50843386 --- /dev/null +++ b/src/pyflask/ai/skill/knowledge/nwb-best-practices.md @@ -0,0 +1,108 @@ +# NWB Best Practices + +Distilled from the [official NWB Inspector best practices](https://github.com/NeurodataWithoutBorders/nwbinspector/tree/dev/docs/best_practices). +These are conventions and common-mistake guards that the NWB Inspector checks for. +The conversion agent should follow these when generating code. + +## General + +- **CamelCase for neurodata_type names** (e.g., `ElectricalSeries`, `SpatialSeries`). +- **snake_case for object names** (groups, datasets, attributes). No spaces — use underscores. +- **No slashes or colons in names** — these are path separators in HDF5. +- **No empty strings** — every `description`, `unit`, and text field must have meaningful content. Empty strings and placeholder text like "no description" will be flagged. +- **Avoid metadata duplication** — don't store the same metadata in multiple places. For example, don't add `unit` or `gain` columns to the electrodes table when those belong on `ElectricalSeries`. 
+ +## NWBFile Metadata + +- **File extension**: always `.nwb`. +- **`identifier`**: must be globally unique. Use `str(uuid.uuid4())`. +- **`session_start_time`**: must include timezone info. All other timestamps are relative to this. +- **`timestamps_reference_time`**: defaults to `session_start_time`. Only set explicitly if different. +- **`session_id`**: should be unique across sessions in a dataset. Use a descriptive string, not just a number. +- **`session_description`**: required. Describe what happened in this session. +- **`experiment_description`**: describe the scientific goal. Can use the paper abstract. +- **`experimenter`**: list of strings in "Last, First" format. +- **`institution`**: name of the institution. +- **`keywords`**: list of relevant keywords for discoverability. +- **`related_publications`**: use DOI format: `"doi:10.xxxx/xxxxx"`. +- **Acquisition vs. processing**: raw data goes in `nwbfile.acquisition`. Processed/derived data goes in `nwbfile.processing["module_name"]`. +- **Processing module names**: use standard names: `"ecephys"`, `"ophys"`, `"behavior"`, `"misc"`. Custom names are allowed but standard names enable tool interoperability. + +## Subject + +- **Subject must exist**: every NWB file should have a `Subject` object. +- **`subject_id`**: required for DANDI. Unique identifier for the animal. +- **`sex`**: one of `"M"`, `"F"`, `"U"` (unknown), `"O"` (other). Single uppercase letter. +- **`species`**: Latin binomial (e.g., `"Mus musculus"`) or NCBI taxonomy URI (e.g., `"http://purl.obolibrary.org/obo/NCBITaxon_10090"`). Never use common names like "mouse". +- **`strain`**: the specific strain (e.g., `"C57BL/6J"`). Separate from species. +- **`age`**: ISO 8601 duration format: `"P90D"` (90 days), `"P12W"` (12 weeks), `"P3M"` (3 months). A reference age can be expressed as a range: `"P90D/P120D"`. +- **`date_of_birth`**: preferred over `age` when available (datetime with timezone). 
+- **`weight`**: format as `"numeric unit"`, e.g., `"0.025 kg"` or `"25 g"`. + +## Time Series + +- **Time-first data orientation**: the first dimension of `data` must be time. If your array is `(channels, timepoints)`, transpose it to `(timepoints, channels)`. +- **SI units**: `unit` should be SI where possible (meters, seconds, volts, amperes). Use `conversion` parameter instead of transforming data. +- **Timestamps must be in seconds**: all timestamps are in seconds relative to `session_start_time`. +- **Timestamps must be ascending**: timestamps array must be sorted in ascending order. +- **No NaN in timestamps**: timestamps must never contain NaN values. +- **Use `rate` + `starting_time` for regular sampling**: if data has a constant sampling rate, set `rate` (Hz) and `starting_time` (seconds) instead of providing a `timestamps` array. This saves space and is more precise. +- **Avoid negative timestamps**: all timestamps should be >= 0. Negative timestamps imply data before `session_start_time`, which is usually an error. +- **Use chunking and compression**: for large datasets, use `H5DataIO` with `compression="gzip"` and appropriate chunk sizes. +- **`resolution`**: set to `-1.0` if unknown. Otherwise, provide the smallest meaningful difference between data values. +- **Rate must be positive and nonzero**: if using `rate`, it must be > 0. +- **Use appropriate TimeSeries subtypes**: don't use bare `TimeSeries` when a more specific type exists (e.g., `ElectricalSeries` for ephys, `SpatialSeries` for position). +- **Breaks in continuity**: if there are gaps in recording, either use separate `TimeSeries` objects or provide explicit `timestamps` (not `rate`) to capture the gaps. + +## Tables (DynamicTable) + +- **No JSON strings in columns**: if a column value is structured data, use a proper column type (VectorData, DynamicTableRegion, etc.), not a JSON-encoded string. +- **No empty tables**: don't create DynamicTable objects with zero rows. 
+- **Boolean columns**: name boolean columns with `is_` prefix (e.g., `is_correct`, `is_rewarded`). +- **Timing columns**: name columns containing times with `_time` suffix (e.g., `start_time`, `stop_time`). Use `_times` for ragged arrays of times. +- **Unique IDs**: the `id` column of any DynamicTable should contain unique values. Don't override with non-unique values — use a custom column instead. +- **Avoid single-row tables**: if a table has only one row, consider if there's a more appropriate container. + +## Extracellular Electrophysiology (ecephys) + +- **Electrode `location` is required**: fill with your best estimate of the brain region. Use `"unknown"` if truly unknown. +- **Use Allen Brain Atlas ontology**: for mice, use Allen Brain Atlas terms (full name or abbreviation). Don't invent terms. +- **Anatomical coordinates (`x`, `y`, `z`)**: for precise brain coordinates. For mice, use Allen Institute Common Coordinate Framework v3 (+x = posterior, +y = inferior, +z = right). +- **Relative coordinates (`rel_x`, `rel_y`, `rel_z`)**: for electrode position on the probe. Used by spike sorters to determine proximity. +- **Don't duplicate metadata in electrodes table**: don't add `unit`, `gain`, `offset` columns — those belong on `ElectricalSeries` (`channel_conversion`, `offset`). +- **Spike times must be ascending**: within each unit, spike times must be in ascending order. +- **Spike times must be positive**: all spike times >= 0. Negative times suggest trial-alignment that should be corrected to session-alignment. +- **Use `obs_intervals`**: if the recording has gaps where a unit was not observable, set `obs_intervals` on the units table. No spikes should exist outside observed intervals. + +## Optical Physiology (ophys) + +- **`image_mask` shape consistency**: the `image_mask` column of `PlaneSegmentation` must have the same shape as `reference_images`. 
+- **ImagingPlane required fields**: always set `excitation_lambda`, `indicator`, and `location` on `ImagingPlane`. +- **TwoPhotonSeries rate**: must be nonzero. Get from Suite2p `ops["fs"]` or calculate from timestamps. +- **Store raw imaging data internally**: use chunking + lossless compression (not external file mode). + +## Behavior + +- **SpatialSeries dimensionality**: must have 1 (x), 2 (x,y), or 3 (x,y,z) columns. Not more. +- **SpatialSeries is only for position**: velocity, acceleration, and other derived signals should use `TimeSeries` or `BehavioralTimeSeries`, not `SpatialSeries`. +- **CompassDirection units**: must be `"degrees"` or `"radians"`. +- **CompassDirection data range**: degrees must be in [-360, 360]; radians in [-2pi, 2pi]. + +## Image Series + +- **External mode for animal videos**: behavioral videos (webcam, etc.) should use `external_file` to reference the video file alongside the NWB file. This allows video-optimized lossy codecs. +- **Internal storage for neural imaging**: TwoPhotonSeries and similar neural data should be stored inside the NWB file with lossless compression. +- **Relative paths for external files**: `external_file` paths should be relative to the NWB file location. +- **`starting_frame`**: only set when using `external_file`. Not applicable for internally stored data. + +## Optogenetics + +- **Every `OptogeneticStimulusSite` must have an `OptogeneticSeries`**: don't create stimulus sites without corresponding stimulus data. + +## Extensions + +- **Use sparingly**: prefer core NWB types and DynamicTable columns before creating extensions. +- **Check for existing extensions** in the NDX Catalog before creating new ones. +- **Use `ndx-template`** to scaffold new extensions. +- **Cache the spec**: always write the extension specification into the NWB file (`cache_spec=True`). +- **Flag for human expert**: the conversion skill should flag when an extension might be needed rather than creating one automatically. 
diff --git a/src/pyflask/ai/skill/knowledge/pynwb-advanced-io.md b/src/pyflask/ai/skill/knowledge/pynwb-advanced-io.md new file mode 100644 index 000000000..286fb515a --- /dev/null +++ b/src/pyflask/ai/skill/knowledge/pynwb-advanced-io.md @@ -0,0 +1,98 @@ +# Advanced I/O — PyNWB Patterns + +Patterns for efficient storage of large datasets. + +## H5DataIO — Compression and Chunking + +```python +from hdmf.backends.hdf5.h5_utils import H5DataIO + +# Basic gzip compression (good default) +compressed = H5DataIO(data=large_array, compression="gzip") + +# Higher compression level (1-9, default 4) +compressed = H5DataIO(data=large_array, compression="gzip", compression_opts=9) + +# LZF — faster compression/decompression, lower ratio +compressed = H5DataIO(data=large_array, compression="lzf") + +# Custom chunk shape (important for access patterns) +compressed = H5DataIO( + data=large_array, # shape: (n_frames, height, width) + compression="gzip", + chunks=(1, height, width), # one frame per chunk for frame-by-frame access +) + +# For time series data — chunk along time axis +compressed = H5DataIO( + data=traces, # shape: (n_timepoints, n_channels) + compression="gzip", + chunks=(10000, n_channels), # 10k timepoints per chunk +) +``` + +## DataChunkIterator — Datasets Too Large for Memory + +When data doesn't fit in RAM, use `DataChunkIterator` to stream data during write: + +```python +from hdmf.data_utils import DataChunkIterator + +def data_generator(): + """Yield one chunk at a time from files on disk.""" + for file_path in sorted(data_files): + chunk = np.load(file_path) # load one chunk at a time + yield chunk + +data_iterator = DataChunkIterator( + data=data_generator(), + maxshape=(None, n_channels), # None = unlimited along first dim + dtype=np.float32, +) + +ts = TimeSeries( + name="large_recording", + data=H5DataIO(data_iterator, compression="gzip"), + rate=30000.0, + unit="volts", +) +nwbfile.add_acquisition(ts) +``` + +## GenericDataChunkIterator — From 
Existing Arrays + +For arrays that are already memory-mapped (e.g., from HDF5 or memmap): + +```python +from hdmf.data_utils import GenericDataChunkIterator + +class MyIterator(GenericDataChunkIterator): + def _get_data(self, selection): + return my_memmap[selection] + + def _get_maxshape(self): + return my_memmap.shape + + def _get_dtype(self): + return my_memmap.dtype + +iterator = MyIterator(buffer_gb=1.0) # process 1 GB at a time +``` + +## When to Use Each Approach + +| Data Size | Approach | +|-----------|----------| +| < 1 GB | `H5DataIO(data=array, compression="gzip")` | +| 1-10 GB | `H5DataIO` with explicit `chunks` tuned for access pattern | +| > 10 GB | `DataChunkIterator` or `GenericDataChunkIterator` to stream | +| Memory-mapped source | `GenericDataChunkIterator` subclass | + +## Notes + +- Always use compression for large datasets. `gzip` is the safest default (universally + supported). `lzf` is faster but HDF5-specific. +- Chunk shape should match the most common access pattern: if you read frames one at a + time, chunk by frame; if you read channels, chunk by channel. +- `maxshape=(None, ...)` allows the dataset to be extended along the first dimension. +- The `buffer_gb` parameter on `GenericDataChunkIterator` controls memory usage. diff --git a/src/pyflask/ai/skill/knowledge/pynwb-behavior.md b/src/pyflask/ai/skill/knowledge/pynwb-behavior.md new file mode 100644 index 000000000..24e7b47e0 --- /dev/null +++ b/src/pyflask/ai/skill/knowledge/pynwb-behavior.md @@ -0,0 +1,137 @@ +# Behavior Containers — PyNWB Patterns + +All behavior container types and when to use each. 
+ +## Container Selection Guide + +| Data Type | Container | Child Type | Example | +|-----------|-----------|-----------|---------| +| Spatial position (x, y, z) | `Position` | `SpatialSeries` | Running on linear track | +| Continuous signals | `BehavioralTimeSeries` | `TimeSeries` | Running speed, lick rate | +| Irregular events | `BehavioralEvents` | `TimeSeries` | Lever presses at variable times | +| Pupil diameter | `PupilTracking` | `TimeSeries` | Eye tracking pupil size | +| Gaze position | `EyeTracking` | `SpatialSeries` | Eye tracking x,y position | +| Head direction | `CompassDirection` | `SpatialSeries` | Angular heading | + +All containers go in `processing["behavior"]`. + +## Position + +```python +from pynwb.behavior import Position, SpatialSeries +from neuroconv.tools.nwb_helpers import get_module + +position = Position() +position.create_spatial_series( + name="animal_position", + data=pos_xy, # shape: (n_timepoints, 2) + timestamps=timestamps, + unit="meters", + reference_frame="Top-left corner of arena", + conversion=0.01, # if data is in cm +) + +behavior = get_module(nwbfile, "behavior", "Processed behavioral data") +behavior.add(position) +``` + +## BehavioralTimeSeries + +For **continuous** behavioral signals sampled at regular intervals: + +```python +from pynwb.behavior import BehavioralTimeSeries +from pynwb import TimeSeries + +bts = BehavioralTimeSeries() +bts.create_timeseries( + name="running_speed", + data=speed, + rate=30.0, + unit="m/s", + description="Treadmill running speed", +) +bts.create_timeseries( + name="lick_rate", + data=lick_rate, + rate=30.0, + unit="licks/s", + description="Lick rate smoothed over 100ms", +) +behavior.add(bts) +``` + +## BehavioralEvents + +For **irregularly timed** behavioral events: + +```python +from pynwb.behavior import BehavioralEvents + +be = BehavioralEvents() +be.create_timeseries( + name="lever_presses", + data=np.ones(n_presses), # amplitude/value at each event + timestamps=press_times, # 
irregular timestamps + unit="n.a.", + description="Times of lever press events", +) +behavior.add(be) +``` + +## PupilTracking + +```python +from pynwb.behavior import PupilTracking + +pt = PupilTracking() +pt.create_timeseries( + name="pupil_diameter", + data=pupil_diam, + rate=60.0, + unit="meters", + conversion=1e-3, # if data is in mm + description="Pupil diameter from DeepLabCut", +) +behavior.add(pt) +``` + +## EyeTracking + +```python +from pynwb.behavior import EyeTracking, SpatialSeries + +et = EyeTracking() +et.create_spatial_series( + name="gaze_position", + data=gaze_xy, # shape: (n_timepoints, 2) + rate=60.0, + unit="meters", + reference_frame="Screen center", + description="Gaze position from eye tracker", +) +behavior.add(et) +``` + +## CompassDirection + +```python +from pynwb.behavior import CompassDirection, SpatialSeries + +cd = CompassDirection() +cd.create_spatial_series( + name="head_direction", + data=heading_angles, # shape: (n_timepoints,) + rate=30.0, + unit="radians", # must be "radians" or "degrees" + reference_frame="0=East, pi/2=North", +) +behavior.add(cd) +``` + +## Notes + +- `SpatialSeries` is only for position data (1-3 columns). For velocity, acceleration, + or other derived signals, use `TimeSeries` inside `BehavioralTimeSeries`. +- `CompassDirection` data must be in `[-2pi, 2pi]` (radians) or `[-360, 360]` (degrees). +- Prefer `rate` + `starting_time` over `timestamps` for regularly sampled data. diff --git a/src/pyflask/ai/skill/knowledge/pynwb-icephys.md b/src/pyflask/ai/skill/knowledge/pynwb-icephys.md new file mode 100644 index 000000000..e4dec7042 --- /dev/null +++ b/src/pyflask/ai/skill/knowledge/pynwb-icephys.md @@ -0,0 +1,125 @@ +# Intracellular Electrophysiology (icephys) — PyNWB Patterns + +Construction patterns for patch clamp / intracellular recording data. 
+ +## Device + Electrode + +```python +from pynwb.icephys import IntracellularElectrode + +device = nwbfile.create_device( + name="Amplifier", + description="MultiClamp 700B", + manufacturer="Molecular Devices", +) + +electrode = nwbfile.create_icephys_electrode( + name="electrode_0", + description="Patch clamp electrode", + device=device, +) +``` + +## Recording Series Types + +**CurrentClampSeries** — response recorded during current injection: +```python +from pynwb.icephys import CurrentClampSeries + +cc_response = CurrentClampSeries( + name="current_clamp_response", + data=voltage_trace, # recorded voltage (numpy array) + electrode=electrode, + rate=20000.0, # sampling rate in Hz + unit="volts", + gain=1.0, + stimulus_description="step_protocol", + sweep_number=np.uint32(0), # optional, for grouping sweeps +) +nwbfile.add_acquisition(cc_response) +``` + +**CurrentClampStimulusSeries** — the injected current waveform: +```python +from pynwb.icephys import CurrentClampStimulusSeries + +cc_stimulus = CurrentClampStimulusSeries( + name="current_clamp_stimulus", + data=current_waveform, # injected current (numpy array) + electrode=electrode, + rate=20000.0, + unit="amperes", + gain=1.0, + sweep_number=np.uint32(0), +) +nwbfile.add_stimulus(cc_stimulus) +``` + +**VoltageClampSeries** — response recorded during voltage clamp: +```python +from pynwb.icephys import VoltageClampSeries + +vc_response = VoltageClampSeries( + name="voltage_clamp_response", + data=current_trace, # recorded current + electrode=electrode, + rate=20000.0, + unit="amperes", + gain=1.0, + stimulus_description="voltage_step", +) +nwbfile.add_acquisition(vc_response) +``` + +**VoltageClampStimulusSeries** — the command voltage: +```python +from pynwb.icephys import VoltageClampStimulusSeries + +vc_stimulus = VoltageClampStimulusSeries( + name="voltage_clamp_stimulus", + data=voltage_command, + electrode=electrode, + rate=20000.0, + unit="volts", + gain=1.0, +) +nwbfile.add_stimulus(vc_stimulus) 
+```
+
+**IZeroClampSeries** — recording with no current injection (I=0 mode):
+```python
+from pynwb.icephys import IZeroClampSeries
+
+izero = IZeroClampSeries(
+    name="izero_response",
+    data=voltage_trace,
+    electrode=electrode,
+    rate=20000.0,
+    unit="volts",
+    stimulus_description="I=0",
+)
+nwbfile.add_acquisition(izero)
+```
+
+## Notes
+
+- **`SweepTable` is deprecated.** Use `sweep_number` on individual series for simple
+  grouping of stimulus/response pairs from the same sweep; for richer structure, prefer
+  the modern icephys metadata tables (`IntracellularRecordingsTable` and, if needed, the
+  higher-level `SimultaneousRecordingsTable` / `SequentialRecordingsTable` /
+  `RepetitionsTable` / `ExperimentalConditionsTable`) over the deprecated
+  `SweepTable` hierarchy.
+- Each electrode represents a physical pipette. Multiple sweeps use the same electrode.
+- Stimulus and response series should be paired: for each stimulus series, there should
+  be a corresponding acquisition series recorded from the same electrode.
+- `gain` is the amplifier gain (float). Set to `1.0` if gain is already applied to data.
+
+## Metadata YAML Template
+
+```yaml
+Icephys:
+  Device:
+    - name: Amplifier
+      description: MultiClamp 700B patch clamp amplifier
+      manufacturer: Molecular Devices
+  IntracellularElectrode:
+    - name: electrode_0
+      description: Borosilicate glass pipette, 3-5 MOhm
+```
diff --git a/src/pyflask/ai/skill/knowledge/pynwb-images.md b/src/pyflask/ai/skill/knowledge/pynwb-images.md
new file mode 100644
index 000000000..13039b396
--- /dev/null
+++ b/src/pyflask/ai/skill/knowledge/pynwb-images.md
@@ -0,0 +1,59 @@
+# Images — PyNWB Patterns
+
+Patterns for static images and video references in NWB files.
+ +## Static Images + +```python +from pynwb.image import GrayscaleImage, RGBImage, RGBAImage, Images + +# Single grayscale image (e.g., mean projection) +mean_img = GrayscaleImage( + name="mean_projection", + data=mean_array, # shape: (height, width), dtype float or uint + description="Mean fluorescence projection", +) + +# RGB image (e.g., histology) +histology = RGBImage( + name="histology", + data=rgb_array, # shape: (height, width, 3) + description="Post-hoc histology image", +) + +# Group related images +images = Images( + name="reference_images", + images=[mean_img, histology], + description="Reference images for this session", +) +nwbfile.add_acquisition(images) +``` + +## ImageSeries — External Video Files + +For behavioral videos, use `external_file` to reference videos alongside the NWB file. +This avoids re-encoding video data and preserves the original codec. + +```python +from pynwb.image import ImageSeries + +video = ImageSeries( + name="behavior_video", + external_file=["./videos/session01_cam1.avi"], # relative path + format="external", + rate=30.0, + starting_frame=[0], + description="Side-view behavioral camera", + unit="n.a.", +) +nwbfile.add_acquisition(video) +``` + +## Notes + +- Use **relative paths** for `external_file` so the NWB file remains portable. +- `starting_frame` is a list with one entry per file in `external_file`. +- For neural imaging data (two-photon, miniscope), store data **inside** the NWB file + using `TwoPhotonSeries`/`OnePhotonSeries`, not as external files. +- `GrayscaleImage` expects 2D arrays; `RGBImage` expects 3D with last dim = 3. 
diff --git a/src/pyflask/ai/skill/knowledge/pynwb-ophys-advanced.md b/src/pyflask/ai/skill/knowledge/pynwb-ophys-advanced.md new file mode 100644 index 000000000..ec381c518 --- /dev/null +++ b/src/pyflask/ai/skill/knowledge/pynwb-ophys-advanced.md @@ -0,0 +1,171 @@ +# Optical Physiology (ophys) — Advanced PyNWB Patterns + +Construction patterns beyond the basics in `nwb-best-practices.md`. + +## ImagingPlane + OpticalChannel + +```python +from pynwb.ophys import OpticalChannel + +device = nwbfile.create_device( + name="Microscope", + description="Two-photon resonant scanning microscope", + manufacturer="Bruker", +) + +optical_channel = OpticalChannel( + name="green", + description="GCaMP emission channel", + emission_lambda=520.0, +) + +imaging_plane = nwbfile.create_imaging_plane( + name="ImagingPlane", + optical_channel=optical_channel, + imaging_rate=30.0, + description="Imaging plane in CA1", + device=device, + excitation_lambda=920.0, + indicator="GCaMP6f", + location="CA1", + grid_spacing=[0.001, 0.001], # meters per pixel (1 um/px) + grid_spacing_unit="meters", +) +``` + +## TwoPhotonSeries vs OnePhotonSeries + +```python +from pynwb.ophys import TwoPhotonSeries, OnePhotonSeries + +# Two-photon (ScanImage, Scanbox, Bruker) +two_photon = TwoPhotonSeries( + name="TwoPhotonSeries", + data=image_data, # shape: (n_frames, height, width) + imaging_plane=imaging_plane, + rate=30.0, + unit="n.a.", +) +nwbfile.add_acquisition(two_photon) + +# One-photon / widefield (Miniscope, Inscopix, widefield) +one_photon = OnePhotonSeries( + name="OnePhotonSeries", + data=image_data, + imaging_plane=imaging_plane, + rate=30.0, + unit="n.a.", +) +nwbfile.add_acquisition(one_photon) +``` + +## PlaneSegmentation — ROI Masks + +Three mask formats are supported. 
Use the one that matches your segmentation output: + +**pixel_mask** — sparse format, best for small ROIs in large FOV: +```python +from pynwb.ophys import PlaneSegmentation, ImageSegmentation +from neuroconv.tools.nwb_helpers import get_module + +img_seg = ImageSegmentation() +ophys_module = get_module(nwbfile, "ophys", "Optical physiology data") +ophys_module.add(img_seg) + +plane_seg = img_seg.create_plane_segmentation( + name="PlaneSegmentation", + description="ROIs from Suite2p", + imaging_plane=imaging_plane, +) + +# Each ROI: list of (x, y, weight) tuples +for roi_mask in roi_masks: + plane_seg.add_roi(pixel_mask=roi_mask) + # roi_mask = [(x1, y1, w1), (x2, y2, w2), ...] +``` + +**image_mask** — dense format, one full-FOV mask per ROI: +```python +plane_seg = img_seg.create_plane_segmentation( + name="PlaneSegmentation", + description="ROIs from CaImAn", + imaging_plane=imaging_plane, +) + +for mask_2d in image_masks: + plane_seg.add_roi(image_mask=mask_2d) + # mask_2d shape: (height, width), same as imaging plane +``` + +## RoiResponseSeries — Fluorescence Traces + +```python +from pynwb.ophys import RoiResponseSeries, DfOverF, Fluorescence + +# Create a region referencing all (or some) ROIs +roi_table_region = plane_seg.create_roi_table_region( + region=list(range(n_rois)), + description="All ROIs", +) + +# Raw fluorescence +fluorescence = Fluorescence() +ophys_module.add(fluorescence) +fluorescence.create_roi_response_series( + name="RoiResponseSeries", + data=F, # shape: (n_frames, n_rois) + rois=roi_table_region, + rate=30.0, + unit="n.a.", +) + +# dF/F +dff = DfOverF() +ophys_module.add(dff) +dff.create_roi_response_series( + name="DfOverF", + data=dff_data, # shape: (n_frames, n_rois) + rois=roi_table_region, + rate=30.0, + unit="n.a.", +) +``` + +## MotionCorrection + +```python +from pynwb.ophys import MotionCorrection, CorrectedImageStack + +corrected = CorrectedImageStack( + corrected=corrected_two_photon, # TwoPhotonSeries (corrected data) + 
original=two_photon, # TwoPhotonSeries (original data) + xy_translation=TimeSeries( + name="xy_translation", + data=shifts, # shape: (n_frames, 2) — x,y shifts + rate=30.0, + unit="pixels", + ), +) + +motion_correction = MotionCorrection(corrected_image_stacks=[corrected]) +ophys_module.add(motion_correction) +``` + +## Multi-Plane Imaging + +For multi-plane imaging, create separate ImagingPlane, TwoPhotonSeries, and +PlaneSegmentation for each plane: + +```python +for plane_idx in range(n_planes): + ip = nwbfile.create_imaging_plane( + name=f"ImagingPlane{plane_idx}", + optical_channel=optical_channel, + imaging_rate=volume_rate, + device=device, + excitation_lambda=920.0, + indicator="GCaMP6f", + location=f"CA1_plane{plane_idx}", + ) + # Create TwoPhotonSeries and PlaneSegmentation per plane... +``` diff --git a/src/pyflask/ai/skill/knowledge/pynwb-optogenetics.md b/src/pyflask/ai/skill/knowledge/pynwb-optogenetics.md new file mode 100644 index 000000000..3f03623dc --- /dev/null +++ b/src/pyflask/ai/skill/knowledge/pynwb-optogenetics.md @@ -0,0 +1,73 @@ +# Optogenetics — PyNWB Patterns + +Construction patterns for optogenetic stimulation data. 
+ +## Device + Stimulus Site + +```python +device = nwbfile.create_device( + name="Laser", + description="473nm DPSS laser for ChR2 activation", + manufacturer="Cobolt", +) + +ogen_site = nwbfile.create_ogen_site( + name="ogen_site", + device=device, + description="Fiber optic cannula targeting left mPFC", + excitation_lambda=473.0, # nm + location="mPFC", # brain region +) +``` + +## Optogenetic Series + +```python +from pynwb.ogen import OptogeneticSeries + +ogen_series = OptogeneticSeries( + name="optogenetic_stimulus", + data=laser_waveform, # power in watts (numpy array, shape: n_timepoints) + site=ogen_site, + rate=10000.0, # sampling rate of the stimulus waveform + unit="watts", + description="5ms pulses at 20Hz, 10mW", +) +nwbfile.add_stimulus(ogen_series) +``` + +For **event-based** stimulation (on/off times rather than continuous waveform): +```python +ogen_series = OptogeneticSeries( + name="optogenetic_stimulus", + data=pulse_amplitudes, # power at each pulse + timestamps=pulse_times, # time of each pulse in seconds + site=ogen_site, + unit="watts", +) +nwbfile.add_stimulus(ogen_series) +``` + +## Notes + +- Every `OptogeneticStimulusSite` must have at least one `OptogeneticSeries`. + Don't create sites without corresponding stimulus data. +- `excitation_lambda` is the wavelength in nm (e.g., 473 for ChR2, 590 for NpHR, + 635 for Chrimson). +- `location` should use standard brain region names (Allen Brain Atlas for mice). +- Store the stimulus waveform, not just on/off times, when available. 
+ +## Metadata YAML Template + +```yaml +Ogen: + Device: + - name: Laser + description: 473nm DPSS laser + manufacturer: Cobolt + OptogeneticStimulusSite: + - name: ogen_site + description: Fiber optic cannula, 200um core, 0.39 NA + excitation_lambda: 473.0 + location: mPFC +``` diff --git a/src/pyflask/ai/skill/knowledge/repo-structure.md b/src/pyflask/ai/skill/knowledge/repo-structure.md new file mode 100644 index 000000000..c2fe737fe --- /dev/null +++ b/src/pyflask/ai/skill/knowledge/repo-structure.md @@ -0,0 +1,1436 @@ +# Canonical CatalystNeuro NWB Conversion Repo Structure + +This document is a practical reference for generating a new `-lab-to-nwb` conversion repository following the CatalystNeuro pattern established by the [cookiecutter-my-lab-to-nwb-template](https://github.com/catalystneuro/cookiecutter-my-lab-to-nwb-template). All code examples are drawn from real production repos (cai-lab-to-nwb, giocomo-lab-to-nwb). + +--- + +## 1. Directory Structure + +A conversion repo has this exact layout: + +``` +-lab-to-nwb/ +├── .github/ +│ └── workflows/ +│ ├── auto-publish.yml # PyPI publish on GitHub release +│ └── test-install.yml # Monthly CI: install + import test +├── .gitignore +├── .pre-commit-config.yaml # black, ruff, codespell, trailing whitespace +├── LICENSE # BSD-3 +├── README.md +├── make_env.yml # Conda environment definition +├── pyproject.toml # Build config, deps, tooling +└── src/ + └── _lab_to_nwb/ # Python package (underscored slug) + ├── __init__.py # Empty or minimal + ├── / # One directory per conversion/experiment + │ ├── __init__.py # Exports the NWBConverter and custom interfaces + │ ├── _nwbconverter.py + │ ├── _convert_session.py + │ ├── _convert_all_sessions.py + │ ├── _metadata.yaml + │ ├── .py + │ ├── .py + │ ├── interfaces/ # Optional: subdirectory if many interfaces + │ │ ├── __init__.py + │ │ ├── .py + │ │ └── .py + │ ├── utils/ # Optional: helper scripts + │ └── conversion_notes.md # Free-form notes about the conversion + 
└── <conversion_2>/             # Additional conversions for the same lab
+            └── ...
+```
+
+### Naming conventions
+
+| Concept | Convention | Example |
+|---------|-----------|---------|
+| Repo name | `<lab>-lab-to-nwb` | `cai-lab-to-nwb` |
+| Package slug | `<lab>_lab_to_nwb` (underscored) | `cai_lab_to_nwb` |
+| Conversion directory | `<conversion_name>` or descriptive name | `zaki_2024`, `wen22` |
+| NWBConverter class | `<ConversionName>NWBConverter` | `Zaki2024NWBConverter` |
+| Interface class | `<ConversionName><Modality>Interface` | `Zaki2024ShockStimuliInterface` |
+| Metadata file | `<conversion_name>_metadata.yaml` | `zaki_2024_metadata.yaml` |
+| Convert session script | `<conversion_name>_convert_session.py` | `zaki_2024_convert_session.py` |
+| Convert all script | `<conversion_name>_convert_all_sessions.py` | `zaki_2024_convert_all_sessions.py` |
+
+### The `__init__.py` files
+
+The conversion-level `__init__.py` exports the key classes so they can be imported cleanly:
+
+```python
+# src/cai_lab_to_nwb/zaki_2024/__init__.py
+# (can be empty, or export key classes)
+```
+
+If you have an `interfaces/` subdirectory, its `__init__.py` re-exports everything:
+
+```python
+# src/cai_lab_to_nwb/zaki_2024/interfaces/__init__.py
+from .eztrack_interface import EzTrackFreezingBehaviorInterface
+from .zaki_2024_edf_interface import Zaki2024EDFInterface, Zaki2024MultiEDFInterface
+from .minian_interface import MinianSegmentationInterface, MinianMotionCorrectionInterface
+from .zaki_2024_sleep_classification_interface import Zaki2024SleepClassificationInterface
+from .miniscope_imaging_interface import MiniscopeImagingInterface
+from .zaki_2024_shock_stimuli_interface import Zaki2024ShockStimuliInterface
+from .zaki_2024_cell_registration_interface import Zaki2024CellRegistrationInterface
+```
+
+---
+
+## 2. pyproject.toml
+
+The build system uses **hatchling** (the modern standard).
Here is the canonical structure with all required fields:
+
+```toml
+[project]
+name = "<lab>-lab-to-nwb"
+version = "0.0.1"
+description = "NWB conversion scripts, functions, and classes for <Lab> lab conversion"
+readme = "README.md"
+authors = [{ name = "CatalystNeuro", email = "ben.dichter@catalystneuro.com" }]
+maintainers = [{ name = "CatalystNeuro", email = "ben.dichter@catalystneuro.com" }]
+license = { file = "LICENSE" }
+requires-python = ">=3.10"
+classifiers = [
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13"
+]
+
+dependencies = [
+    "neuroconv",
+    "nwbinspector",
+]
+
+[project.urls]
+Repository = "https://github.com/catalystneuro/<lab>-lab-to-nwb"
+
+# Per-conversion pinned dependencies (install with: pip install -e .[conversion_name])
+[project.optional-dependencies]
+<conversion_name> = [
+    "neuroconv==0.7.0",  # Pin to exact version used during development
+    # Add conversion-specific extras here, e.g.:
+    # "mne",
+    # "opencv-python-headless",
+    # "ndx-miniscope==0.5.1",
+]
+
+[dependency-groups]
+dev = [
+    "pre-commit",
+    "ruff",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build]
+include = [
+    "*.yaml",
+    "*.yml",
+    "*.json",
+]  # Ensures metadata YAML files are included in sdist and wheel
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/<lab>_lab_to_nwb"]
+
+[tool.hatch.build.targets.sdist]
+packages = ["src/<lab>_lab_to_nwb"]
+
+[tool.ruff]
+
+[tool.ruff.lint]
+select = [
+    "F401",  # Unused import
+    "I",  # All isort rules
+    "UP006",  # non-pep585 annotation
+    "UP007",  # non-pep604 annotation (Union -> |)
+    "UP045",  # non-pep604 annotation (Optional -> | None)
+]
+fixable = ["ALL"]
+
+[tool.ruff.lint.isort]
+relative-imports-order = "closest-to-furthest"
+known-first-party = ["<lab>_lab_to_nwb"]
+
+[tool.codespell]
+skip = '.git*,*.pdf,*.css'
+check-hidden = true
+ignore-words-list = 'assertin'
+```
+
+### Key
points about dependencies
+
+- The top-level `dependencies` list should contain unpinned `neuroconv` and `nwbinspector` for broad compatibility.
+- Per-conversion optional dependencies should **pin exact versions** so that a specific conversion remains reproducible.
+- Conversion-specific extras (e.g., `mne` for EDF files, `opencv-python-headless` for video, NWB extension packages like `ndx-miniscope`) go in the optional dependencies section.
+
+### Real-world example (cai-lab-to-nwb)
+
+The cai-lab-to-nwb repo pins all its core dependencies because it has a single primary conversion:
+
+```toml
+dependencies = [
+    "pynwb==3.0.0",
+    "neuroconv==0.7.4",
+    "nwbinspector==0.6.3",
+    "roiextractors==0.5.13",
+    "ipykernel",
+    "openpyxl",
+    "mne",
+    "opencv-python-headless",
+    "ndx-miniscope==0.5.1",
+]
+```
+
+---
+
+## 3. NWBConverter Class
+
+The `NWBConverter` is the central orchestrator. It declares which `DataInterface` classes handle each data modality and wires them together.
+
+### The pattern
+
+```python
+"""Primary NWBConverter class for this dataset."""
+from neuroconv import NWBConverter
+from neuroconv.datainterfaces import (
+    SpikeGLXRecordingInterface,
+    PhySortingInterface,
+)
+
+from <lab>_lab_to_nwb.<conversion_name>.interfaces import (
+    <ConversionName>BehaviorInterface,
+)
+
+
+class <ConversionName>NWBConverter(NWBConverter):
+    """Primary conversion class for <conversion_name>."""
+
+    data_interface_classes = dict(
+        Recording=SpikeGLXRecordingInterface,
+        Sorting=PhySortingInterface,
+        Behavior=<ConversionName>BehaviorInterface,
+    )
+```
+
+### How to choose interfaces
+
+The `data_interface_classes` dict maps **arbitrary string keys** to interface classes. The keys become the keys you use in `source_data` and `conversion_options` dicts. Choose keys that describe the data modality clearly.
+ +Common built-in interfaces from `neuroconv.datainterfaces`: + +| Modality | Interface | When to use | +|----------|-----------|-------------| +| Neuropixels raw | `SpikeGLXRecordingInterface` | SpikeGLX .bin/.meta files | +| Neuropixels LFP | `SpikeGLXLFPInterface` | SpikeGLX LFP band | +| Spike sorting | `PhySortingInterface` | Phy/Kilosort output | +| Spike sorting | `KiloSortSortingInterface` | KiloSort output directly | +| Calcium imaging | `TiffImagingInterface` | TIFF stacks | +| Calcium segmentation | `Suite2pSegmentationInterface` | Suite2p output | +| Video | `VideoInterface` | Behavioral video files | +| Intracellular | `AbfInterface` | Axon Binary Format | +| EDF signals | Custom needed | EDF format | + +When no built-in interface exists for a data type, write a custom `BaseDataInterface` subclass (see Section 6). + +### Real-world example (cai-lab-to-nwb, zaki_2024) + +This converter has 10 data interfaces, mixing built-in and custom: + +```python +from neuroconv import NWBConverter +from neuroconv.datainterfaces import VideoInterface +from neuroconv.utils.dict import DeepDict +from datetime import timedelta + +from cai_lab_to_nwb.zaki_2024.interfaces import ( + MinianSegmentationInterface, + Zaki2024EDFInterface, + Zaki2024MultiEDFInterface, + EzTrackFreezingBehaviorInterface, + Zaki2024SleepClassificationInterface, + MiniscopeImagingInterface, + MinianMotionCorrectionInterface, + Zaki2024ShockStimuliInterface, + Zaki2024CellRegistrationInterface, +) + + +class Zaki2024NWBConverter(NWBConverter): + """Primary conversion class Cai Lab dataset.""" + + data_interface_classes = dict( + MiniscopeImaging=MiniscopeImagingInterface, + MinianSegmentation=MinianSegmentationInterface, + MinianMotionCorrection=MinianMotionCorrectionInterface, + SleepClassification=Zaki2024SleepClassificationInterface, + EDFSignals=Zaki2024EDFInterface, + MultiEDFSignals=Zaki2024MultiEDFInterface, + FreezingBehavior=EzTrackFreezingBehaviorInterface, + Video=VideoInterface, + 
ShockStimuli=Zaki2024ShockStimuliInterface, + CellRegistration=Zaki2024CellRegistrationInterface, + ) +``` + +### Overriding `get_metadata()` + +Override `get_metadata()` when you need to compute metadata that depends on the source data itself: + +```python +def get_metadata(self) -> DeepDict: + metadata = super().get_metadata() + + # Example: adjust session_start_time based on imaging timestamps + if "MiniscopeImaging" in self.data_interface_objects: + imaging_interface = self.data_interface_objects["MiniscopeImaging"] + imaging_timestamps = imaging_interface.get_original_timestamps() + if imaging_timestamps[0] < 0.0: + time_shift = timedelta(seconds=abs(imaging_timestamps[0])) + session_start_time = imaging_interface.get_metadata()["NWBFile"]["session_start_time"] + metadata["NWBFile"].update(session_start_time=session_start_time - time_shift) + + return metadata +``` + +### Not all interfaces must be present in every session + +The converter class declares the **superset** of all possible interfaces. In `convert_session.py`, you only add entries to `source_data` for interfaces that are relevant to that particular session. The converter will only instantiate interfaces that have entries in `source_data`. + +--- + +## 4. convert_session.py + +This is the script that converts a single session. It follows a strict pattern: + +1. Build `source_data` dict (file paths for each interface) +2. Build `conversion_options` dict (per-interface options like `stub_test`) +3. Instantiate the converter +4. Get auto-extracted metadata, layer on YAML metadata, layer on session-specific metadata +5. 
Call `converter.run_conversion()` + +### The canonical pattern + +```python +"""Primary script to run to convert an entire session of data using the NWBConverter.""" +from pathlib import Path +from typing import Union +from datetime import datetime +from zoneinfo import ZoneInfo + +from neuroconv.utils import load_dict_from_file, dict_deep_update + +from _lab_to_nwb.._nwbconverter import NWBConverter + + +def session_to_nwb( + data_dir_path: Union[str, Path], + output_dir_path: Union[str, Path], + stub_test: bool = False, +): + data_dir_path = Path(data_dir_path) + output_dir_path = Path(output_dir_path) + if stub_test: + output_dir_path = output_dir_path / "nwb_stub" + output_dir_path.mkdir(parents=True, exist_ok=True) + + session_id = "subject_session_identifier" + nwbfile_path = output_dir_path / f"{session_id}.nwb" + + # ---- Step 1: Build source_data and conversion_options ---- + source_data = dict() + conversion_options = dict() + + # Add Recording + source_data.update(dict(Recording=dict( + file_path=str(data_dir_path / "recording.ap.bin"), + ))) + conversion_options.update(dict(Recording=dict(stub_test=stub_test))) + + # Add Sorting + source_data.update(dict(Sorting=dict( + folder_path=str(data_dir_path / "sorting"), + ))) + conversion_options.update(dict(Sorting=dict())) + + # Add Behavior (custom interface) + source_data.update(dict(Behavior=dict( + file_path=str(data_dir_path / "behavior.csv"), + ))) + conversion_options.update(dict(Behavior=dict())) + + # ---- Step 2: Instantiate converter ---- + converter = NWBConverter(source_data=source_data) + + # ---- Step 3: Build metadata (layered) ---- + # Layer 1: Auto-extracted from source files + metadata = converter.get_metadata() + + # Layer 2: Set session_start_time with timezone + session_start_time = datetime(year=2020, month=1, day=1, tzinfo=ZoneInfo("US/Eastern")) + metadata["NWBFile"]["session_start_time"] = session_start_time + + # Layer 3: Merge in the hand-edited YAML metadata + 
editable_metadata_path = Path(__file__).parent / "_metadata.yaml" + editable_metadata = load_dict_from_file(editable_metadata_path) + metadata = dict_deep_update(metadata, editable_metadata) + + # Layer 4: Session-specific overrides + metadata["Subject"]["subject_id"] = "mouse001" + metadata["NWBFile"]["session_id"] = session_id + + # ---- Step 4: Run conversion ---- + converter.run_conversion( + metadata=metadata, + nwbfile_path=nwbfile_path, + conversion_options=conversion_options, + overwrite=True, + ) + + +if __name__ == "__main__": + data_dir_path = Path("/path/to/raw/data/") + output_dir_path = Path("~/conversion_nwb/") + stub_test = False + + session_to_nwb( + data_dir_path=data_dir_path, + output_dir_path=output_dir_path, + stub_test=stub_test, + ) +``` + +### Metadata layering order + +This is critical. Later layers override earlier ones: + +1. **Auto-extracted** (`converter.get_metadata()`): Reads metadata from the source files themselves (e.g., sampling rate from SpikeGLX .meta files, session_start_time from file timestamps). +2. **session_start_time with timezone**: Must always be set explicitly with a timezone. Use `ZoneInfo` (Python 3.9+) or `pytz`. +3. **YAML file** (`dict_deep_update` with loaded YAML): Lab-level metadata that applies to all sessions of this conversion (institution, lab, experimenter, species, publications, etc.). +4. **Session-specific overrides**: `subject_id`, `session_id`, `session_description`, etc. that vary per session. 
+ +### Real-world example (cai-lab-to-nwb, zaki_2024) + +The real convert_session.py shows the pattern with conditional interface inclusion (not all sessions have all data types): + +```python +def session_to_nwb( + output_dir_path: Union[str, Path], + subject_id: str, + session_id: str, + date_str: str, + time_str: str, + session_description: str, + stub_test: bool = False, + overwrite: bool = False, + verbose: bool = False, + imaging_folder_path: Union[str, Path] = None, + minian_folder_path: Union[str, Path] = None, + video_file_path: Union[str, Path] = None, + freezing_output_file_path: Union[str, Path] = None, + edf_file_path: Union[str, Path] = None, + sleep_classification_file_path: Union[str, Path] = None, + shock_stimulus: dict = None, +): + # ... + source_data = dict() + conversion_options = dict() + + # Conditionally add interfaces based on what data is available + if imaging_folder_path: + imaging_folder_path = Path(imaging_folder_path) + source_data.update(dict(MiniscopeImaging=dict(folder_path=imaging_folder_path))) + conversion_options.update(dict(MiniscopeImaging=dict(stub_test=stub_test))) + + if minian_folder_path: + minian_folder_path = Path(minian_folder_path) + source_data.update(dict(MinianSegmentation=dict(folder_path=minian_folder_path))) + conversion_options.update(dict(MinianSegmentation=dict(stub_test=stub_test))) + + if video_file_path: + source_data.update(dict(Video=dict(file_paths=[video_file_path]))) + conversion_options.update(dict(Video=dict(stub_test=stub_test))) + + if shock_stimulus is not None: + source_data.update(ShockStimuli=dict()) + conversion_options.update(ShockStimuli=shock_stimulus) + + converter = Zaki2024NWBConverter(source_data=source_data, verbose=verbose) + metadata = converter.get_metadata() + + # Timezone localization + eastern = pytz.timezone("US/Eastern") + metadata["NWBFile"]["session_start_time"] = eastern.localize( + metadata["NWBFile"]["session_start_time"] + ) + + # YAML metadata layer + 
editable_metadata_path = Path(__file__).parent / "zaki_2024_metadata.yaml" + editable_metadata = load_dict_from_file(editable_metadata_path) + metadata = dict_deep_update(metadata, editable_metadata) + + # Session-specific metadata + metadata["Subject"]["subject_id"] = subject_id + metadata["NWBFile"]["session_description"] = session_description + metadata["NWBFile"]["session_id"] = session_id + + converter.run_conversion( + metadata=metadata, + nwbfile_path=nwbfile_path, + conversion_options=conversion_options, + overwrite=overwrite, + ) +``` + +### The `stub_test` pattern + +The `stub_test` parameter is a convention that: +- Redirects output to a `nwb_stub/` subdirectory +- Gets passed to each interface's `conversion_options` so they only write a small subset of data (e.g., first few seconds of recording) +- Enables fast iteration during development without writing full datasets + +```python +if stub_test: + output_dir_path = output_dir_path / "nwb_stub" +# ... +conversion_options.update(dict(Recording=dict(stub_test=stub_test))) +``` + +### NWB file naming + +Use descriptive, BIDS-like naming: `sub-_ses-.nwb` or simply `.nwb`. + +--- + +## 5. convert_all_sessions.py + +This script handles batch conversion of all sessions in a dataset. It follows a template pattern with three functions: + +### The canonical pattern + +```python +"""Primary script to run to convert all sessions in a dataset using session_to_nwb.""" +from pathlib import Path +from typing import Union +from concurrent.futures import ProcessPoolExecutor, as_completed +from pprint import pformat +import traceback +from tqdm import tqdm + +from .convert_session import session_to_nwb + + +def dataset_to_nwb( + *, + data_dir_path: Union[str, Path], + output_dir_path: Union[str, Path], + max_workers: int = 1, + verbose: bool = True, + stub_test: bool = False, +): + """Convert the entire dataset to NWB. 
+ + Parameters + ---------- + data_dir_path : Union[str, Path] + The path to the directory containing the raw data. + output_dir_path : Union[str, Path] + The path to the directory where the NWB files will be saved. + max_workers : int, optional + The number of workers to use for parallel processing, by default 1 + verbose : bool, optional + Whether to print verbose output, by default True + stub_test : bool, optional + Whether to run in stub test mode, by default False + """ + data_dir_path = Path(data_dir_path) + output_dir_path = Path(output_dir_path) + session_to_nwb_kwargs_per_session = get_session_to_nwb_kwargs_per_session( + data_dir_path=data_dir_path, + ) + + futures = [] + with ProcessPoolExecutor(max_workers=max_workers) as executor: + for session_to_nwb_kwargs in session_to_nwb_kwargs_per_session: + session_to_nwb_kwargs["output_dir_path"] = output_dir_path + session_to_nwb_kwargs["verbose"] = verbose + session_to_nwb_kwargs["stub_test"] = stub_test + exception_file_path = ( + data_dir_path / f"ERROR_{session_to_nwb_kwargs.get('session_id', 'unknown')}.txt" + ) + futures.append( + executor.submit( + safe_session_to_nwb, + session_to_nwb_kwargs=session_to_nwb_kwargs, + exception_file_path=exception_file_path, + ) + ) + for _ in tqdm(as_completed(futures), total=len(futures)): + pass + + +def safe_session_to_nwb( + *, + session_to_nwb_kwargs: dict, + exception_file_path: Union[Path, str], +): + """Convert a session to NWB while handling any errors by writing to exception_file_path.""" + exception_file_path = Path(exception_file_path) + try: + session_to_nwb(**session_to_nwb_kwargs) + except Exception as e: + with open(exception_file_path, mode="w") as f: + f.write(f"session_to_nwb_kwargs: \n {pformat(session_to_nwb_kwargs)}\n\n") + f.write(traceback.format_exc()) + + +def get_session_to_nwb_kwargs_per_session( + *, + data_dir_path: Union[str, Path], +): + """Get the kwargs for session_to_nwb for each session in the dataset. 
+ + Returns + ------- + list[dict[str, Any]] + A list of dictionaries containing the kwargs for session_to_nwb for each session. + """ + # IMPLEMENT THIS: Return a list of dicts, each containing the kwargs for one session. + # Common strategies: + # 1. Iterate over session directories: list(data_dir_path.iterdir()) + # 2. Read from a spreadsheet/CSV with session metadata + # 3. Load from a pre-computed YAML parameters file + raise NotImplementedError + + +if __name__ == "__main__": + data_dir_path = Path("/path/to/raw/data/") + output_dir_path = Path("~/conversion_nwb/") + max_workers = 1 + stub_test = False + + dataset_to_nwb( + data_dir_path=data_dir_path, + output_dir_path=output_dir_path, + max_workers=max_workers, + stub_test=stub_test, + ) +``` + +### Key design decisions + +- **`ProcessPoolExecutor`**: Enables parallel conversion of sessions. Default `max_workers=1` for sequential processing. +- **`safe_session_to_nwb`**: Wraps `session_to_nwb` in a try/except that writes errors to a file instead of crashing the batch. This is critical for large datasets. +- **`get_session_to_nwb_kwargs_per_session`**: This is the function that must be customized per conversion. It returns a list of dicts, where each dict contains exactly the kwargs needed by `session_to_nwb`. 
+ +### Real-world example of `get_session_to_nwb_kwargs_per_session` (cai-lab-to-nwb) + +```python +def get_session_to_nwb_kwargs_per_session(*, data_dir_path): + import pandas as pd + subjects_df = pd.read_excel(data_dir_path / "Ca_EEG_Design.xlsx") + subjects = subjects_df["Mouse"] + session_to_nwb_kwargs_per_session = [] + + for subject_id in subjects: + yaml_file_path = Path(__file__).parent / "utils/conversion_parameters.yaml" + conversion_parameter_dict = load_dict_from_file(yaml_file_path) + if subject_id in conversion_parameter_dict: + for session_id in conversion_parameter_dict[subject_id].keys(): + session_to_nwb_kwargs_per_session.append( + conversion_parameter_dict[subject_id][session_id] + ) + + return session_to_nwb_kwargs_per_session +``` + +### Real-world example of iterating over directories (giocomo-lab-to-nwb wen22) + +The wen22 conversion uses a simpler pattern -- iterating directly over session directories: + +```python +session_path_list = [path for path in data_path.iterdir() if path.name != "VR"] +for session_path in session_path_list: + session_id = session_path.name + # ... build source_data from session_path ... + converter = Wen21NWBConverter(source_data=source_data) + # ... run conversion ... +``` + +--- + +## 6. Custom DataInterface + +When no built-in NeuroConv interface exists for a data type, write a custom one by subclassing `BaseDataInterface`. This is the most common customization point. + +### The pattern + +```python +"""Primary class for converting experiment-specific .""" +from pynwb.file import NWBFile + +from neuroconv.basedatainterface import BaseDataInterface +from neuroconv.utils import DeepDict + + +class Interface(BaseDataInterface): + """ interface for conversion.""" + + keywords = ["behavior"] # Used for discoverability + + def __init__(self, file_path: str, verbose: bool = False): + # Load data LAZILY -- do not read entire files here. + # Store paths and parameters as instance attributes. 
+ # Call super().__init__() to register source_data. + self.file_path = file_path + self.verbose = verbose + super().__init__(file_path=file_path) + + def get_metadata(self) -> DeepDict: + # Extract metadata from source files that can be auto-detected. + # Return a DeepDict (nested dict) matching the NWB metadata schema. + metadata = super().get_metadata() + # Example: metadata["NWBFile"]["session_start_time"] = + return metadata + + def add_to_nwbfile(self, nwbfile: NWBFile, metadata: dict, **conversion_options): + # The core method. Read data from source files and add to the NWBFile. + # conversion_options come from the conversion_options dict passed to run_conversion. + raise NotImplementedError() +``` + +### Critical details about `__init__` + +- The `__init__` method's parameters become the keys in the `source_data` dict. +- Call `super().__init__()` and pass all the init parameters as keyword arguments. This stores them in `self.source_data` for later reference. +- Use type hints from `pydantic` for validation: `FilePath`, `DirectoryPath`. + +```python +from pydantic import FilePath + +class MyInterface(BaseDataInterface): + def __init__(self, file_path: FilePath, sampling_frequency: float, verbose: bool = False): + self.file_path = file_path + self.verbose = verbose + self.sampling_frequency = sampling_frequency + super().__init__(file_path=file_path, sampling_frequency=sampling_frequency) +``` + +Then in `source_data`: +```python +source_data["MyModality"] = dict(file_path="/path/to/file.csv", sampling_frequency=30000.0) +``` + +### Critical details about `add_to_nwbfile` + +- The method signature is `add_to_nwbfile(self, nwbfile: NWBFile, metadata: dict, **kwargs)`. +- Extra keyword arguments in the method signature correspond to keys in `conversion_options`. +- You can include `stub_test: bool = False` to support the stub test pattern. +- Use processing modules for derived data (see `get_module` in Section 9). 
+ +### Real-world example: Simple interface (Zaki2024ShockStimuliInterface) + +This interface takes no source files -- the data is passed entirely through `conversion_options`: + +```python +from pynwb.file import NWBFile +from pynwb.epoch import TimeIntervals +from neuroconv.basedatainterface import BaseDataInterface +from neuroconv.utils import DeepDict +from typing import Optional + + +class Zaki2024ShockStimuliInterface(BaseDataInterface): + """Adds annotated events of shock times.""" + + keywords = ["behavior", "sleep stages"] + + def __init__(self, verbose: bool = False): + self.verbose = verbose + super().__init__() + + def get_metadata(self) -> DeepDict: + metadata = super().get_metadata() + return metadata + + def add_to_nwbfile( + self, + nwbfile: NWBFile, + shock_amplitude: float, + shock_times: list, + shock_duration: float, + metadata: Optional[dict] = None, + ): + description = ( + "During aversive encoding, after a baseline period of 2 min, " + "mice received three 2 s foot shocks..." 
+ ) + shock_stimuli = TimeIntervals(name="ShockStimuli", description=description) + shock_stimuli.add_column(name="shock_amplitude", description="Shock amplitude in mA") + for start_time in shock_times: + shock_stimuli.add_interval( + start_time=start_time, + stop_time=start_time + shock_duration, + shock_amplitude=shock_amplitude, + ) + nwbfile.add_stimulus(shock_stimuli) +``` + +The corresponding `conversion_options` in the convert_session.py: +```python +conversion_options.update( + ShockStimuli=dict( + shock_times=[120.0, 180.0, 240.0], + shock_amplitude=1.5, + shock_duration=2.0, + ), +) +``` + +### Real-world example: Complex interface with temporal alignment (EzTrackFreezingBehaviorInterface) + +This interface reads data from a CSV file, supports temporal alignment, and writes both a TimeSeries and TimeIntervals: + +```python +import numpy as np +import pandas as pd +from pynwb import TimeSeries +from pynwb.epoch import TimeIntervals +from pynwb.file import NWBFile +from neuroconv.basedatainterface import BaseDataInterface +from neuroconv.utils import DeepDict +from pydantic import FilePath +from typing import Optional, List + + +class EzTrackFreezingBehaviorInterface(BaseDataInterface): + """Adds intervals of freezing behavior and motion series.""" + + keywords = ["behavior", "freezing", "motion"] + + def __init__(self, file_path: FilePath, video_sampling_frequency: float, verbose: bool = False): + self.file_path = file_path + self.verbose = verbose + self.video_sampling_frequency = video_sampling_frequency + # Private attributes for temporal alignment + self._start_times = None + self._stop_times = None + self._starting_time = None + + def get_metadata(self) -> DeepDict: + metadata = super().get_metadata() + return metadata + + def get_interval_times(self): + """Extract start and stop times of freezing events.""" + freezing_behavior_df = pd.read_csv(self.file_path) + freezing_values = freezing_behavior_df["Freezing"].values + changes_in_freezing = 
np.diff(freezing_values) + freezing_start = np.where(changes_in_freezing == 100)[0] + 1 + freezing_stop = np.where(changes_in_freezing == -100)[0] + 1 + + start_frames = freezing_behavior_df["Frame"].values[freezing_start] + stop_frames = freezing_behavior_df["Frame"].values[freezing_stop] + + # Use aligned times if set, otherwise compute from frames + start_times = ( + self._start_times if self._start_times is not None + else start_frames / self.video_sampling_frequency + ) + stop_times = ( + self._stop_times if self._stop_times is not None + else stop_frames / self.video_sampling_frequency + ) + return start_times, stop_times + + def set_aligned_interval_times(self, start_times, stop_times): + self._start_times = start_times + self._stop_times = stop_times + + def set_aligned_starting_time(self, aligned_start_time): + self._starting_time = aligned_start_time + + def add_to_nwbfile(self, nwbfile: NWBFile, metadata: Optional[dict] = None, stub_test: bool = False): + freezing_behavior_df = pd.read_csv(self.file_path) + start_times, stop_times = self.get_interval_times() + + motion_data = freezing_behavior_df["Motion"].values + starting_time = self._starting_time if self._starting_time is not None else self.get_starting_time() + + motion_series = TimeSeries( + name="MotionSeries", + description="Motion measured by pixel change between frames.", + data=motion_data[:100] if stub_test else motion_data, + unit="n.a", + starting_time=starting_time, + rate=self.video_sampling_frequency, + ) + + freeze_intervals = TimeIntervals(name="FreezingIntervals", description="...") + for start_time, stop_time in zip(start_times, stop_times): + freeze_intervals.add_interval( + start_time=start_time, + stop_time=stop_time, + timeseries=[motion_series], + ) + + if "behavior" not in nwbfile.processing: + behavior_module = nwbfile.create_processing_module( + name="behavior", description="Contains behavior data" + ) + else: + behavior_module = nwbfile.processing["behavior"] + + 
behavior_module.add(motion_series) + behavior_module.add(freeze_intervals) +``` + +### Real-world example: Complex interface with sync channel (Wen21EventsInterface) + +This interface demonstrates reading NI-DAQ sync channels to compute behavioral timestamp offsets: + +```python +from nwb_conversion_tools.basedatainterface import BaseDataInterface +from nwb_conversion_tools.utils.types import FolderPathType +from nwb_conversion_tools.tools.nwb_helpers import get_module +from hdmf.backends.hdf5.h5_utils import H5DataIO +from pynwb.behavior import Position, SpatialSeries +from pynwb import NWBFile, TimeSeries + + +class Wen21EventsInterface(BaseDataInterface): + def __init__(self, session_path: FolderPathType): + super().__init__(session_path=session_path) + + def run_conversion(self, nwbfile: NWBFile, metadata: dict): + behavior_module = get_module(nwbfile, "behavior") + session_path = Path(self.source_data["session_path"]) + + # ... read position files, compute temporal offset from NIDQ sync channel ... + + # Add position data with compression + spatial_series_object = SpatialSeries( + name="position", + description="position within the virtual reality wheel", + data=H5DataIO(position_data, compression="gzip"), + reference_frame="unknown", + unit="m", + conversion=0.01, + timestamps=position_timestamps, + ) + + pos_obj = Position(name="position within the virtual reality wheel") + pos_obj.add_spatial_series(spatial_series_object) + behavior_module.add_data_interface(pos_obj) +``` + +Note: The older `nwb_conversion_tools` API used `run_conversion()` instead of `add_to_nwbfile()`. Modern NeuroConv uses `add_to_nwbfile()`. + +--- + +## 7. metadata.yaml + +The metadata YAML file contains hand-edited metadata that applies to all sessions of a conversion. It is loaded in `convert_session.py` and merged on top of auto-extracted metadata. 
+ +### Structure and required fields + +```yaml +NWBFile: + keywords: + - hippocampus + - learning + - memory + related_publications: + - https://doi.org/10.1038/s41586-024-08168-4 + session_description: > + A rich text description of the experiment. Can also just be the abstract + of the publication. This is REQUIRED by NWB. + experiment_description: > + Optional longer description of the experimental protocol. + institution: Icahn School of Medicine at Mount Sinai + lab: Cai + experimenter: + - Last, First Middle + - Last, First Middle + surgery: > + Optional: description of surgical procedures. + virus: > + Optional: description of viral constructs used. +Subject: + species: Mus musculus # REQUIRED. Use Latin binomial name. + description: > + A rich text description of the subject. + age: P12W/P18W # ISO 8601 duration. "P90D" = 90 days old. + sex: M # One of M, F, U, or O + strain: C57BL/6J # Optional + genotype: wild-type # Optional + date_of_birth: 2014-06-22 00:00:00-04:00 # Optional, with timezone +``` + +### How metadata merging works + +The `dict_deep_update` function performs a recursive merge. For nested dicts, keys are merged. For lists, the entire list is replaced (not appended). For scalar values, the later value wins. + +```python +from neuroconv.utils import load_dict_from_file, dict_deep_update + +# Auto-extracted metadata (from file headers, etc.) +metadata = converter.get_metadata() +# Example: metadata["NWBFile"]["session_start_time"] is already set from file timestamps + +# YAML metadata overlays on top +editable_metadata = load_dict_from_file(Path(__file__).parent / "metadata.yaml") +metadata = dict_deep_update(metadata, editable_metadata) +# Now metadata["NWBFile"]["lab"], ["institution"], etc. 
are set from the YAML +# But session_start_time from auto-extraction is preserved (YAML doesn't override it) + +# Session-specific overrides +metadata["Subject"]["subject_id"] = "mouse001" # Per-session value +``` + +### Extended metadata for specific modalities + +For optical physiology, the metadata YAML can also define imaging planes, optical channels, etc.: + +```yaml +Ophys: + OnePhotonSeries: + - name: OnePhotonSeries + description: Imaging data from Miniscope. + imaging_plane: ImagingPlane + unit: n.a. + ImagingPlane: + - name: ImagingPlane + description: Imaging plane for Miniscope imaging data. + excitation_lambda: 496.0 + location: CA1 + device: Microscope + optical_channel: + - name: GreenChannel + description: Green channel of the microscope. + emission_lambda: 513.0 + indicator: GCaMP6f +``` + +### Per-subject metadata + +For datasets with multiple subjects, you can use a separate YAML file for subject-specific metadata: + +```yaml +# subject_metadata.yml (from giocomo wen22) +N2: + subject_id: N2 + age: P90D + strain: C57Bl/6 + genotype: wildtype + date_of_birth: 2019-10-22 + weight: 0.016 + sex: U +``` + +Then load and merge per subject: +```python +subject_metadata_from_yaml = load_dict_from_file(Path("./subject_metadata.yml")) +subject_metadata = subject_metadata_from_yaml[subject_id] +metadata["Subject"] = dict_deep_update(metadata["Subject"], subject_metadata) +``` + +--- + +## 8. Temporal Alignment + +When multiple data streams have different clocks or start times, you must align them. This is done by overriding `temporally_align_data_interfaces()` in the NWBConverter. + +### The pattern + +```python +class MyNWBConverter(NWBConverter): + data_interface_classes = dict(...) 
+ + def temporally_align_data_interfaces(self, metadata=None, conversion_options=None): + """Align all data streams to a common time reference.""" + + # Access interfaces by their keys + if "Recording" in self.data_interface_objects: + recording_interface = self.data_interface_objects["Recording"] + # Get original timestamps + original_timestamps = recording_interface.get_original_timestamps() + # Apply a shift + recording_interface.set_aligned_timestamps(original_timestamps + time_shift) + # Or set just the starting time + recording_interface.set_aligned_starting_time(new_start_time) +``` + +### Real-world example (cai-lab-to-nwb, zaki_2024) + +This is the most comprehensive temporal alignment example available. It handles the case where imaging timestamps start before zero (negative timestamps): + +```python +def temporally_align_data_interfaces(self, metadata=None, conversion_options=None): + if "MiniscopeImaging" in self.data_interface_objects: + imaging_interface = self.data_interface_objects["MiniscopeImaging"] + imaging_timestamps = imaging_interface.get_original_timestamps() + + if imaging_timestamps[0] < 0.0: + time_shift = abs(imaging_timestamps[0]) + + # Shift imaging timestamps + imaging_interface.set_aligned_timestamps(imaging_timestamps + time_shift) + + # Shift segmentation timestamps + if "MinianSegmentation" in self.data_interface_objects: + seg_interface = self.data_interface_objects["MinianSegmentation"] + seg_timestamps = seg_interface.get_original_timestamps() + seg_interface.set_aligned_timestamps(seg_timestamps + time_shift) + + # Shift sleep classification intervals + if "SleepClassification" in self.data_interface_objects: + sleep_interface = self.data_interface_objects["SleepClassification"] + start_times, stop_times, states = sleep_interface.get_sleep_states_times() + start_times += time_shift + stop_times += time_shift + sleep_interface.set_aligned_interval_times( + start_times=start_times, stop_times=stop_times + ) + + # Shift EDF 
starting time + if "EDFSignals" in self.data_interface_objects: + edf_interface = self.data_interface_objects["EDFSignals"] + edf_interface.set_aligned_starting_time(time_shift) + + # Shift freezing behavior + if "FreezingBehavior" in self.data_interface_objects: + fb_interface = self.data_interface_objects["FreezingBehavior"] + start_times, stop_times = fb_interface.get_interval_times() + fb_interface.set_aligned_interval_times( + start_times=start_times + time_shift, + stop_times=stop_times + time_shift, + ) + starting_time = fb_interface.get_starting_time() + fb_interface.set_aligned_starting_time(starting_time + time_shift) + + # Shift video timestamps + if "Video" in self.data_interface_objects: + video_interface = self.data_interface_objects["Video"] + video_timestamps = video_interface.get_original_timestamps() + video_interface.set_aligned_timestamps(video_timestamps + time_shift) +``` + +### Real-world example: Sync channel alignment (giocomo wen22) + +The wen22 conversion computes an offset between behavioral timestamps and neural recording timestamps using an NI-DAQ sync channel: + +```python +def calculate_behavioral_offset_with_nidq_channel(self, df_epochs): + """Calculate offset between behavioral timestamps and NIDQ sync pulses.""" + session_path = Path(self.source_data["session_path"]) + nidq_file_path = session_path / f"{session_path.stem.replace('g0', 'g0_t0')}.nidq.bin" + + if nidq_file_path.is_file(): + nidq_extractor = SpikeGLXRecordingExtractor(session_path, stream_id="nidq") + epoch_change_trace = nidq_extractor.get_traces(channel_ids=["nidq#XA2"]).ravel() + times = nidq_extractor.get_times() + + # Binarize the sync signal + epoch_change_trace_bin = np.zeros(epoch_change_trace.shape, dtype=int) + epoch_change_trace_bin[epoch_change_trace > (np.max(epoch_change_trace) // 2)] = 1 + epoch_start_idxs = np.where(np.diff(epoch_change_trace_bin) > 0)[0] + + df_epochs["epoch_start_by_niqd"] = times[epoch_start_idxs][:df_epochs.shape[0]] + offset = 
(df_epochs["start_time"] - df_epochs["epoch_start_by_niqd"]).mean() + return offset + return 0 +``` + +Then all behavioral timestamps are shifted by this offset: +```python +df_position_data["timestamps"] -= offset_for_behavioral_time_stamps +``` + +### Alignment API summary + +| Method | When to use | +|--------|-------------| +| `interface.get_original_timestamps()` | Get timestamps before any alignment | +| `interface.set_aligned_timestamps(timestamps)` | Replace all timestamps | +| `interface.set_aligned_starting_time(t)` | Shift starting time for regularly sampled data | +| `interface.set_aligned_interval_times(start_times, stop_times)` | Custom method for interval-based interfaces | + +--- + +## 9. Common Utilities + +### `load_dict_from_file` + +Loads YAML or JSON files into a Python dict: + +```python +from neuroconv.utils import load_dict_from_file + +metadata = load_dict_from_file(Path("metadata.yaml")) +``` + +### `dict_deep_update` + +Recursively merges two dicts. The second dict's values override the first's: + +```python +from neuroconv.utils import dict_deep_update + +base = {"NWBFile": {"lab": "old", "institution": "MIT"}} +override = {"NWBFile": {"lab": "new"}} +result = dict_deep_update(base, override) +# result = {"NWBFile": {"lab": "new", "institution": "MIT"}} +``` + +### `H5DataIO` + +Wraps numpy arrays for HDF5 compression. 
Use this for large data arrays: + +```python +from hdmf.backends.hdf5.h5_utils import H5DataIO + +spatial_series = SpatialSeries( + name="position", + data=H5DataIO(position_data, compression="gzip"), + timestamps=timestamps, + reference_frame="unknown", + unit="m", +) +``` + +### `get_module` + +Gets or creates a processing module in an NWB file: + +```python +from neuroconv.tools.nwb_helpers import get_module + +# Gets existing "behavior" module or creates it +behavior_module = get_module(nwbfile, "behavior") + +# Then add data interfaces to it +behavior_module.add(my_time_series) +``` + +Or create manually: +```python +if "behavior" not in nwbfile.processing: + behavior_module = nwbfile.create_processing_module( + name="behavior", description="Contains behavior data" + ) +else: + behavior_module = nwbfile.processing["behavior"] +``` + +### `DeepDict` + +The metadata type used throughout NeuroConv. Behaves like a nested defaultdict: + +```python +from neuroconv.utils import DeepDict + +metadata = DeepDict() +metadata["NWBFile"]["lab"] = "My Lab" # Auto-creates nested structure +``` + +--- + +## 10. Testing Patterns + +### stub_test + +The primary testing mechanism during development. Every `session_to_nwb` function should accept `stub_test: bool`: + +```python +def session_to_nwb(..., stub_test: bool = False): + if stub_test: + output_dir_path = output_dir_path / "nwb_stub" + # ... + conversion_options.update(dict(Recording=dict(stub_test=stub_test))) +``` + +Run it: +```python +session_to_nwb(data_dir_path=data_dir_path, output_dir_path=output_dir_path, stub_test=True) +``` + +This produces a small NWB file (usually a few MB) that can be quickly inspected. 
+ +### nwbinspector + +After conversion, validate with nwbinspector: + +```bash +# Command line +nwbinspector /path/to/output.nwb + +# Or in Python +from nwbinspector import inspect_nwbfile +results = list(inspect_nwbfile(nwbfile_path="/path/to/output.nwb")) +for result in results: + print(result) +``` + +Common issues nwbinspector catches: +- Missing required fields (session_description, session_start_time, identifier) +- Timezone-naive datetimes (session_start_time must have timezone) +- Subject fields not matching controlled vocabularies +- Data without units +- Empty containers + +### CI test (test-install.yml) + +The GitHub Actions workflow tests that the package can be installed and imported: + +```yaml +name: Installation +on: + workflow_dispatch: + schedule: + - cron: "0 0 1 * *" # Monthly + +jobs: + run: + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: ["ubuntu-latest", "macos-latest", "windows-latest"] + python-version: ["3.12"] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - run: pip install -e . + - run: python -c "import _lab_to_nwb" +``` + +### Manual validation workflow + +1. Run `session_to_nwb()` with `stub_test=True` +2. Open the stub NWB file with `pynwb` or NWB Widgets to visually inspect +3. Run `nwbinspector` on the stub file +4. Fix any issues +5. Run `session_to_nwb()` with `stub_test=False` on one real session +6. Run `nwbinspector` on the full file +7. Run `dataset_to_nwb()` for batch conversion + +--- + +## Appendix A: Supporting Files + +### make_env.yml + +```yaml +name: _lab_to_nwb_env +channels: +- conda-forge +- defaults +dependencies: +- python>=3.11 +- pip +- pip: + - --editable . 
+``` + +### .pre-commit-config.yaml + +```yaml +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace + +- repo: https://github.com/psf/black + rev: 25.1.0 + hooks: + - id: black + exclude: ^docs/ + +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.11.2 + hooks: + - id: ruff + args: [ --fix ] + +- repo: https://github.com/codespell-project/codespell + rev: v2.4.1 + hooks: + - id: codespell + additional_dependencies: + - tomli +``` + +### auto-publish.yml + +```yaml +name: Upload Package to PyPI +on: + release: + types: [published] +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - run: | + python -m pip install --upgrade pip build + python -m build + - uses: pypa/gh-action-pypi-publish@v1.4.2 + with: + verbose: true + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} +``` + +--- + +## Appendix B: Checklist for Generating a New Repo + +1. Create the directory structure as shown in Section 1 +2. Generate `pyproject.toml` with hatchling build system and correct package name +3. Create `make_env.yml`, `.pre-commit-config.yaml`, `.gitignore` +4. Copy the GitHub Actions workflows (`test-install.yml`, `auto-publish.yml`) +5. Write the `metadata.yaml` with all known lab/experiment metadata +6. Identify which built-in NeuroConv interfaces match each data modality +7. Write custom `BaseDataInterface` subclasses for data types without built-in interfaces +8. Write the `NWBConverter` class with all interfaces in `data_interface_classes` +9. If temporal alignment is needed, override `temporally_align_data_interfaces()` +10. Write `convert_session.py` following the source_data / conversion_options / metadata layering pattern +11. Write `convert_all_sessions.py` with the ProcessPoolExecutor pattern +12. Test with `stub_test=True` +13. 
Validate with `nwbinspector` +14. Write the README with installation and usage instructions + +--- + +## Appendix C: NWB Containers Quick Reference + +When writing custom interfaces, you need to know which PyNWB types to use: + +| Data type | PyNWB class | Where to add it | +|-----------|-------------|-----------------| +| Raw electrophysiology | `ElectricalSeries` | `nwbfile.add_acquisition()` | +| LFP | `LFP` containing `ElectricalSeries` | `ecephys` processing module | +| Spike times | `Units` | `nwbfile.units` | +| Position | `Position` containing `SpatialSeries` | `behavior` processing module | +| Behavioral time series | `TimeSeries` | `behavior` processing module | +| Behavioral events | `TimeIntervals` | `behavior` processing module or `nwbfile.add_stimulus()` | +| Trials | built-in | `nwbfile.add_trial()` with `nwbfile.add_trial_column()` | +| Epochs | built-in | `nwbfile.add_epoch()` with `nwbfile.add_epoch_column()` | +| Calcium imaging | `OnePhotonSeries` or `TwoPhotonSeries` | `nwbfile.add_acquisition()` | +| ROI segmentation | `PlaneSegmentation` in `ImageSegmentation` | `ophys` processing module | +| Fluorescence traces | `RoiResponseSeries` in `Fluorescence` or `DfOverF` | `ophys` processing module | +| Stimulus events | `TimeIntervals` | `nwbfile.add_stimulus()` | +| Sleep states | `TimeIntervals` | custom processing module (e.g., `sleep`) | diff --git a/src/pyflask/ai/skill/phases/01-intake.md b/src/pyflask/ai/skill/phases/01-intake.md new file mode 100644 index 000000000..c1c67960a --- /dev/null +++ b/src/pyflask/ai/skill/phases/01-intake.md @@ -0,0 +1,318 @@ +## Phase 1: Experiment Discovery + +**Goal**: Build a complete picture of the lab's experiments, data modalities, and file organization. + +**Entry**: User invokes `/nwb-convert`, possibly with a path to their data. 
+ +**Exit criteria**: You have a clear `experiment_spec` (written to `conversion_notes.md`) covering: +- What experiments were performed +- All data streams (raw and processed) for each experiment +- File formats for each stream +- How data is organized on disk (directory structure) +- Number of subjects and sessions +- Any special considerations (multiple probes, multiple FOVs, etc.) + +### Step 0a: Check Environment + +**Skip this step if running inside NWB GUIDE** (all packages are pre-installed). + +Before anything else, verify the required Python packages are installed. The skill +needs `neuroconv`, `pynwb`, `dandi`, and several inspection libraries. + +```bash +python3 -c " +missing = [] +for pkg, module in [ + ('neuroconv', 'neuroconv'), + ('pynwb', 'pynwb'), + ('dandi', 'dandi'), + ('nwbinspector', 'nwbinspector'), + ('spikeinterface', 'spikeinterface'), + ('h5py', 'h5py'), + ('remfile', 'remfile'), + ('pandas', 'pandas'), + ('pyyaml', 'yaml'), +]: + try: + __import__(module) + except ImportError: + missing.append(pkg) +if missing: + print('MISSING: ' + ' '.join(missing)) +else: + print('OK') +" +``` + +If packages are missing, install them: +```bash +pip install neuroconv pynwb dandi nwbinspector spikeinterface h5py remfile pandas pyyaml +``` + +The full environment specification is in `skills/nwb-convert/make_env.yml`. If the user +prefers conda, they can create the environment with: +```bash +conda env create -f /make_env.yml +conda activate nwb-convert +``` + +### Step 0b: Create Conversion Repo and Consult Registry + +Before the first user-facing question, set up the conversion repo and check for prior work. + +**Create the repo.** The skill calls the nwb-conversions API to create a private repo +in the `nwb-conversions` GitHub org. The user does NOT need a GitHub account — the API +handles authentication server-side. 
+ +```bash +# API base URL (Cloudflare Worker) +NWB_API="https://nwb-conversions-api.ben-dichter.workers.dev" + +# Derive lab name from user context (ask if unclear) +LAB_NAME="" +REPO_NAME="${LAB_NAME}-to-nwb" + +# Create repo via API +RESPONSE=$(curl -sf -X POST "${NWB_API}/repos" \ + -H "Content-Type: application/json" \ + -d "{\"lab_name\": \"${LAB_NAME}\"}") + +if [ $? -eq 0 ]; then + PUSH_URL=$(echo "$RESPONSE" | python3 -c "import sys,json; print(json.load(sys.stdin)['push_url'])") + mkdir "${REPO_NAME}" && cd "${REPO_NAME}" + git init + git remote add origin "${PUSH_URL}" + git config user.name "nwb-conversions-bot" + git config user.email "nwb-conversions-bot@users.noreply.github.com" +else + # API unreachable — work locally only + mkdir "${REPO_NAME}" && cd "${REPO_NAME}" + git init +fi +``` + +If the API is unreachable, inform the user: +> I'll create a local conversion repo to organize the code. The conversion registry +> is not available right now, but this won't affect the conversion itself. + +All subsequent file creation should happen INSIDE this directory. When a remote is +configured, the skill pushes after every phase. + +**Seed the repo** with a `.gitignore` and initial commit: +```bash +cat > .gitignore << 'EOF' +# Python +__pycache__/ +*.py[cod] +*.egg-info/ +dist/ +build/ +*.egg + +# NWB output (don't commit data files) +*.nwb +nwb_output/ +nwb_stub/ + +# Environment +.env +*.log + +# OS +.DS_Store +Thumbs.db + +# IDE +.vscode/ +.idea/ +EOF + +git add .gitignore +git commit -m "Initial commit: add .gitignore" +if git remote get-url origin &>/dev/null; then git push; fi +``` + +**Fetch the conversion registry** to find similar prior conversions: +```bash +curl -sf "${NWB_API}/registry" > /tmp/registry.yaml || true +``` + +If the API is unreachable or the registry is empty, skip registry consultation and +proceed directly to the opening questions. + +**Search the registry** for relevant prior work. 
Look for matches on: +- Same species +- Same modalities (ecephys, ophys, behavior, icephys) +- Same file formats or interfaces +- Same recording systems (SpikeGLX, OpenEphys, Suite2p, etc.) + +```python +import yaml +from pathlib import Path + +registry_path = Path("/tmp/registry.yaml") +if registry_path.exists() and registry_path.stat().st_size > 0: + with open(registry_path) as f: + registry = yaml.safe_load(f) + + # Find conversions with matching modalities + target_modalities = {"ecephys", "behavior"} # from user description + for conv in registry.get("conversions", []): + overlap = target_modalities & set(conv.get("modalities", [])) + if overlap: + print(f"Similar: {conv['id']} ({conv['repo']})") + print(f" Modalities: {conv['modalities']}") + print(f" Interfaces: {conv['interfaces']}") + if conv.get("lessons"): + print(f" Lessons: {conv['lessons']}") +``` + +If you find relevant prior conversions, mention them to the user: +> I found N similar conversions in our registry that used the same recording system / +> modalities. I'll use those as references as we build yours. + +If the registry is empty or has no matches, proceed normally — this is expected for early conversions. + +### Opening Questions + +Start with broad, open-ended questions. Don't ask all at once — ask 2-3, then follow up. + +**First message should be something like:** +> I'd like to help you convert your data to NWB and publish it on DANDI. Let's start by +> understanding your experiment. +> +> 1. Can you briefly describe your experiment? What were you studying? +> 2. What types of neural recordings did you collect? (e.g., extracellular electrophysiology, +> calcium imaging, intracellular recordings, etc.) +> 3. Did you also record behavioral data? (e.g., position tracking, video, licking, running speed) + +**If the user provided a data path**, inspect the directory structure FIRST: +``` +ls -la +find -maxdepth 3 -type f | head -50 +``` +Then ask targeted questions based on what you see. 
+ +### Follow-up Questions (ask as needed) + +**About recordings:** +- What recording system did you use? (e.g., SpikeGLX, OpenEphys, Intan, Blackrock, Neuralynx, Axona) +- How many probes/electrodes per session? +- Did you do spike sorting? What software? (Kilosort, Phy, CellExplorer, MountainSort) +- Is there LFP data separate from the raw recording? + +**About imaging:** +- What microscope/acquisition software? (ScanImage, Scanbox, Bruker, Inscopix, Miniscope) +- One-photon or two-photon? +- Did you run segmentation? What software? (Suite2p, CaImAn, CNMFE, EXTRACT) +- Single plane or multi-plane? + +**About behavior:** +- Is there pose estimation? (DeepLabCut, SLEAP, LightningPose) +- Video recordings? How many cameras? +- Trial structure? What defines a trial? +- Stimulus presentation? What software? (PsychoPy, Bpod, Arduino) +- Task events? (licks, rewards, tone presentations, etc.) + +**About organization:** +- How are files organized? One folder per session? Per subject? +- Is there a naming convention? +- Are there processed/analyzed files in addition to raw data? +- Approximately how many sessions total? + +**About existing resources (always ask these):** +- Is there a manuscript, preprint, or published paper describing this data? + (If yes, get the DOI or URL — this helps with experiment_description and related_publications) +- Is this data already publicly available in any non-NWB format? (e.g., on Figshare, Zenodo, + institutional repository, or another archive) +- Do you have existing analysis code for this data? (e.g., MATLAB scripts, Python notebooks) + These often reveal data structure, variable names, and processing steps that inform the conversion. +- Do you have any code that reads or converts this data to another format? 
+ (Existing readers save significant reverse-engineering effort) + +### Fetching Publication Details + +When the user provides a DOI, PMID, PMC ID, or publication URL, use the paper fetcher tool +to retrieve the full text (or abstract). This is extremely valuable for understanding the +experiment, data modalities, recording parameters, and subject details. + +```bash +python3 tools/fetch_paper.py "" --extract methods +``` + +The tool accepts DOIs (e.g., `10.1038/s41586-019-1234-5`), PMIDs (e.g., `31234567`), +PMC IDs (e.g., `PMC6789012`), or URLs from doi.org, PubMed, or PMC. + +**What to extract from the paper:** +1. **Methods section** (`--extract methods`): Recording systems, file formats, number of + subjects/sessions, experimental protocols, data acquisition parameters +2. **Abstract** (`--extract abstract`): High-level experiment description for `experiment_description` +3. **Full text** (no `--extract` flag): When you need comprehensive details + +**How to use the information:** +- Pre-fill the experiment description from the abstract +- Identify data modalities and recording systems from methods +- Extract subject counts, species, and session details +- Find stimulus/behavioral task descriptions +- Get the DOI for `related_publications` (format: `"doi:10.xxxx/xxxxx"`) +- Look for mentions of data availability statements that may link to existing public data + +After fetching, confirm key details with the user — papers may describe a larger study +than what the user is converting, or parameters may have changed. + +**About subjects (collect early to plan per-subject metadata):** +- How many subjects are in this dataset? +- Do you have a spreadsheet or file with subject information? +- For each subject, we'll need: subject_id, date of birth (or age at each session), + species (Latin binomial, e.g., "Mus musculus"), sex, genotype, and ideally weight. +- Are there different experimental groups (e.g., different genotypes, treatment vs. control)? 
+ +### What to Record + +After this phase, update `conversion_notes.md` with: + +```markdown +# Conversion Notes + +## Experiment Overview +[Brief description of the experiment] + +## Data Streams +| Stream | Format | Recording System | File Pattern | NeuroConv Interface? | +|--------|--------|-----------------|--------------|---------------------| +| Raw ephys | SpikeGLX .bin | Neuropixel | *_g0_t0.imec0.ap.bin | SpikeGLXRecordingInterface | +| LFP | SpikeGLX .bin | Neuropixel | *_g0_t0.imec0.lf.bin | SpikeGLXLFPInterface | +| Spike sorting | Phy | Kilosort+Phy | phy/ folder | PhySortingInterface | +| Behavior | .txt files | Custom | *position.txt, *licks.txt | Custom needed | + +## Directory Structure +[Description or tree output] + +## Sessions +- Number of subjects: X +- Number of sessions: ~Y +- Session naming convention: ... + +## Existing Resources +- Publication: [DOI or "not yet published"] +- Existing public data: [URL or "none"] +- Analysis code: [URL or path or "none"] +- Existing data readers: [description or "none"] + +## Subjects +| subject_id | species | sex | date_of_birth | genotype | weight | group | +|------------|---------|-----|---------------|----------|--------|-------| +| ... | Mus musculus | M | 2019-10-22 | C57BL/6J | 25 g | control | + +## Open Questions +- [ ] ... +``` + +### Push Phase 1 Results + +After writing `conversion_notes.md`, commit and push: +```bash +git add conversion_notes.md +git commit -m "Phase 1: experiment discovery — data streams and directory structure" +if git remote get-url origin &>/dev/null; then git push; fi +``` diff --git a/src/pyflask/ai/skill/phases/02-data-inspection.md b/src/pyflask/ai/skill/phases/02-data-inspection.md new file mode 100644 index 000000000..9406c4469 --- /dev/null +++ b/src/pyflask/ai/skill/phases/02-data-inspection.md @@ -0,0 +1,157 @@ +## Phase 2: Data Inspection + +**Goal**: Inspect actual data files to confirm formats, understand structure, and map to NeuroConv interfaces. 
+ +**Entry**: You have a general understanding of the experiment from Phase 1. + +**Exit criteria**: For each data stream, you know: +- The exact file format and can read it programmatically +- Which NeuroConv interface handles it (or that custom code is needed) +- The source_data arguments needed (file paths, stream IDs, etc.) +- Any quirks or issues (corrupt files, missing headers, unusual organization) + +### Cross-Reference with Conversion Registry + +Before inspecting files, check the registry's `format_hints` to accelerate interface identification. +If the registry was fetched in Phase 1, use it to pre-match file patterns: + +```python +import yaml +from fnmatch import fnmatch +from pathlib import Path + +registry_path = Path("/tmp/registry.yaml") +if not registry_path.exists() or registry_path.stat().st_size == 0: + print("Registry not available — skipping format hint matching") + registry = {"format_hints": []} +else: + with open(registry_path) as f: + registry = yaml.safe_load(f) + +# Collect actual filenames from the data directory +data_path = Path("") +filenames = [f.name for f in data_path.rglob("*") if f.is_file()] + +# Match filenames against registry format_hints using glob matching +matched_interfaces = {} # interface_name → list of (pattern, seen_in) +for hint in registry.get("format_hints", []): + for pattern in hint["patterns"]: + for filename in filenames: + if fnmatch(filename, pattern): + iface = hint["interface"] + if iface not in matched_interfaces: + matched_interfaces[iface] = [] + matched_interfaces[iface].append({ + "pattern": pattern, + "matched_file": filename, + "seen_in": hint["seen_in"], + }) + break # One match per pattern is enough + +for iface, matches in matched_interfaces.items(): + repos = set() + for m in matches: + repos.update(m["seen_in"]) + print(f"Registry match: {iface} (seen in {sorted(repos)})") + for m in matches: + print(f" {m['pattern']} matched {m['matched_file']}") +``` + +When a filename matches a `format_hint` 
pattern, you can proceed with higher confidence in the +interface selection. If the same pattern has been used successfully in prior conversions, +mention this to the user and skip exploratory probing for that stream. + +### Approach + +1. **Ask for a sample session** — a single, complete session with all data streams: + > Can you point me to one complete example session? I'd like to inspect the files + > to understand the exact format and structure. + +2. **Inspect files directly** using Python. For each data stream: + + **For electrophysiology (SpikeGLX, OpenEphys, etc.):** + ```python + # Try loading with spikeinterface + import spikeinterface.extractors as se + recording = se.read_spikeglx(folder_path, stream_id="imec0.ap") + print(f"Channels: {recording.get_num_channels()}") + print(f"Sampling rate: {recording.get_sampling_frequency()}") + print(f"Duration: {recording.get_total_duration()}") + ``` + + **For spike sorting (Phy, Kilosort, etc.):** + ```python + sorting = se.read_phy(folder_path) + print(f"Units: {sorting.get_num_units()}") + print(f"Unit IDs: {sorting.get_unit_ids()}") + ``` + + **For calcium imaging (ScanImage, Scanbox, Suite2p, etc.):** + ```python + import roiextractors as re + imaging = re.read_scanbox(file_path) + print(f"FOV size: {imaging.get_image_size()}") + print(f"Num frames: {imaging.get_num_frames()}") + print(f"Sampling rate: {imaging.get_sampling_frequency()}") + ``` + + **For behavior files (.mat, .csv, .txt, .pkl, etc.):** + ```python + # For MATLAB files + import h5py # or scipy.io.loadmat for v5 .mat files + with h5py.File(path) as f: + print(list(f.keys())) + # Recursively explore structure + + # For CSV/text + import pandas as pd + df = pd.read_csv(path, sep='\t', nrows=5) + print(df.columns.tolist()) + print(df.head()) + ``` + +3. 
**Test NeuroConv interfaces** — for each data stream that has a matching interface, try instantiating it: + ```python + from neuroconv.datainterfaces import SpikeGLXRecordingInterface + interface = SpikeGLXRecordingInterface(folder_path=path, stream_id="imec0.ap") + metadata = interface.get_metadata() + print(metadata) + ``` + +4. **Identify custom interface needs** — for data streams with no NeuroConv interface: + - Document the file format, structure, and what data/metadata it contains + - Note what NWB types the data should map to (TimeSeries, SpatialSeries, TimeIntervals, etc.) + - Flag these for Phase 5 code generation + +### Common Gotchas + +- **MATLAB v7.3 files** use HDF5 format (use `h5py`), older versions use scipy.io.loadmat +- **Pickle files** may require specific package versions to deserialize +- **Text files** — check delimiter (tab vs comma vs space), header presence, encoding +- **SpikeGLX** — the meta file is essential; make sure .bin and .meta are co-located +- **Suite2p** — look for the `suite2p/plane0/` directory structure +- **Multiple probes** — SpikeGLX uses imec0, imec1, etc.; each needs its own interface instance + +### Update conversion_notes.md + +Add an "Interface Mapping" section: + +```markdown +## Interface Mapping +| Stream | Interface | source_data | Status | +|--------|-----------|-------------|--------| +| Raw AP | SpikeGLXRecordingInterface | folder_path, stream_id="imec0.ap" | Verified | +| LFP | SpikeGLXLFPInterface | folder_path, stream_id="imec0.lf" | Verified | +| Sorting | PhySortingInterface | folder_path | Verified | +| VR position | CUSTOM: VRBehaviorInterface | file_path | Needs implementation | +| Lick events | CUSTOM: EventsInterface | folder_path | Needs implementation | +``` + +### Push Phase 2 Results + +After updating `conversion_notes.md` with the interface mapping, commit and push: +```bash +git add conversion_notes.md +git commit -m "Phase 2: data inspection — interface mapping and format details" +if git 
remote get-url origin &>/dev/null; then git push; fi +``` diff --git a/src/pyflask/ai/skill/phases/03-metadata.md b/src/pyflask/ai/skill/phases/03-metadata.md new file mode 100644 index 000000000..64bb532d2 --- /dev/null +++ b/src/pyflask/ai/skill/phases/03-metadata.md @@ -0,0 +1,191 @@ +## Phase 3: Metadata Collection + +**Goal**: Gather all metadata required for a complete, valid NWB file. + +**Entry**: You know all data streams and their interfaces from Phase 2. + +**Exit criteria**: You have complete metadata for: +- NWBFile-level fields (session_description, experiment_description, institution, lab, etc.) +- Subject fields (species, sex, age, genotype, subject_id) +- Device and electrode/imaging plane descriptions +- Session-specific fields (session_start_time with timezone, session_id) +- Trial/epoch structure if applicable + +### Required NWB Metadata + +**NWBFile (ask the user for these):** +- `session_description` — What happened in this session? (Required by NWB) +- `experiment_description` — Overall experiment description (can be paper abstract) +- `institution` — University/institute name +- `lab` — PI's lab name +- `experimenter` — List of experimenters as ["Last, First"] +- `keywords` — Relevant keywords for discoverability +- `related_publications` — DOI format: `"doi:10.xxxx/xxxxx"` (not URLs) + +**Subject (ask the user for these):** +- `species` — Latin binomial (e.g., "Mus musculus", "Rattus norvegicus", "Homo sapiens") or NCBI taxonomy URI +- `sex` — One of: "M", "F", "U" (unknown), "O" (other). Single uppercase letter only. +- `age` — ISO 8601 duration: "P90D" (90 days), "P12W" (12 weeks), "P3M" (3 months). 
Can be a range: "P90D/P120D" +- `subject_id` — Unique identifier (required for DANDI) +- `genotype` — If transgenic +- `strain` — e.g., "C57BL/6J" (separate from species) +- `date_of_birth` — Preferred over `age` when available (datetime with timezone) +- `weight` — Format as "numeric unit": "0.025 kg" or "25 g" (not just a number) +- `description` — Any additional notes + +### Modality-Specific Metadata + +**For ophys (calcium imaging) experiments, also ask:** +- What brain region were you imaging? (e.g., "CA1", "V1", "mPFC") +- What calcium indicator did you use? (e.g., "GCaMP6f", "GCaMP7f", "jRGECO1a") +- What was the excitation wavelength? (e.g., 920 nm for GCaMP, 1040 nm for jRGECO) +- What objective did you use? (e.g., "Nikon 16x/0.8w") +- Single-plane or multi-plane imaging? + +These map to NWB metadata: +```yaml +Ophys: + Device: + - name: Microscope + description: Two-photon microscope + manufacturer: Scanbox # or Bruker, Thorlabs, etc. + ImagingPlane: + - name: ImagingPlane + description: Imaging plane in hippocampal CA1 + excitation_lambda: 920.0 + indicator: GCaMP6f + location: CA1 +``` + +**For ecephys (extracellular electrophysiology), also ask:** +- What brain region(s) were you recording from? (Use Allen Brain Atlas terminology for mice, e.g., "CA1", "VISp", "MOs") +- What probe model? (e.g., Neuropixels 1.0, Neuropixels 2.0, Cambridge NeuroTech H2) +- How many probes per session? +- Do you have histology-confirmed electrode locations? (If so, these should override intended targets) + +These are usually auto-extracted from SpikeGLX/OpenEphys metadata, but confirm with the user. +Note: every electrode MUST have a `location` value — use "unknown" if the region is truly unknown. + +**Session-specific (often extracted from data):** +- `session_start_time` — MUST include timezone (e.g., America/New_York) +- `session_id` — Unique session identifier + +### How to Ask + +Don't dump a giant form. 
Instead, ask in context: + +> Now I need to collect some metadata for the NWB files. Let me start with the basics: +> +> 1. What institution and lab is this from? +> 2. Who are the experimenters? (First and last names) +> 3. What species are the subjects? Are they a specific strain or transgenic line? + +Then follow up: +> For the NWB files, I need a session description (what happened in a typical session) +> and an experiment description (the overall goal — this could be the abstract from +> your paper if you have one). Can you provide these? + +### Metadata That Can Be Auto-Extracted + +Many fields come from the data files themselves. Check what the interfaces provide: +```python +converter = MyNWBConverter(source_data=source_data) +metadata = converter.get_metadata() +print(json.dumps(metadata, indent=2, default=str)) +``` + +Typically auto-extracted: +- `session_start_time` from SpikeGLX, OpenEphys, ScanImage headers +- `Device` info (probe model, serial number) from SpikeGLX meta files +- `ElectrodeGroup` and electrode positions from probe geometry +- Sampling rates, channel counts + +### Where Metadata Goes + +Metadata is stored in a `metadata.yaml` file alongside the conversion code: + +```yaml +NWBFile: + experiment_description: > + We recorded neural activity in the medial entorhinal cortex + while mice navigated a virtual reality track. + institution: Stanford University + lab: Giocomo Lab + experimenter: + - Wen, John + - Giocomo, Lisa + keywords: + - virtual reality + - entorhinal cortex + - navigation + related_publications: + - doi:10.xxxx/xxxxx # DOI format, not a URL — see the NWBFile field list above + +Subject: + species: Mus musculus + strain: C57BL/6J + sex: M +``` + +Session-specific metadata (subject_id, session_start_time) is set programmatically +in `convert_session.py` since it varies per session. 
+ +### Push Phase 3 Results + +After collecting metadata, commit and push the metadata files (skipping any that were not created, since `git add` aborts on a missing pathspec): +```bash +for f in conversion_notes.md metadata.yaml subject_metadata.yaml; do [ -f "$f" ] && git add "$f"; done +git commit -m "Phase 3: metadata collection — NWBFile, Subject, and device metadata" +if git remote get-url origin &>/dev/null; then git push; fi +``` + +### Per-Subject Metadata + +You MUST collect subject-level metadata for each subject. This is required for DANDI upload. + +For each subject, collect: +- `subject_id` — **Required**. Unique identifier. +- `species` — **Required**. Latin binomial (e.g., "Mus musculus", "Rattus norvegicus"). +- `sex` — **Recommended**. One of "M", "F", "U", "O". +- `date_of_birth` — **Recommended**. Or `age` per session as ISO 8601 duration (e.g., "P90D"). +- `genotype` — **Recommended** if transgenic. +- `weight` — **Recommended**. At time of experiment or implant. +- `strain` — **Recommended** (e.g., "C57BL/6J"). + +If there are multiple subjects, create a `subject_metadata.yaml` (or `.json`) keyed by +subject_id: + +```yaml +N2: + species: Mus musculus + strain: C57BL/6J + sex: M + date_of_birth: 2019-10-22 + weight: "0.025 kg" +R5: + species: Mus musculus + genotype: CaMKII-cre hemizygous + sex: F + date_of_birth: 2019-06-15 + weight: "0.022 kg" +``` + +Ask the user if they have a spreadsheet or JSON file with this information. If they have +analysis code, it often contains subject metadata as a lookup table or config file. + +### Timezone Handling + +Session start times MUST have timezone information. Ask the user: +> What timezone was the data collected in? 
+ +Common US timezones: +- `America/New_York` (Eastern) +- `America/Chicago` (Central) +- `America/Denver` (Mountain) +- `America/Los_Angeles` (Pacific) + +Use `zoneinfo.ZoneInfo` in the conversion code: +```python +from zoneinfo import ZoneInfo +tz = ZoneInfo("America/Los_Angeles") +metadata["NWBFile"]["session_start_time"] = session_start_time.replace(tzinfo=tz) +``` diff --git a/src/pyflask/ai/skill/phases/04-sync.md b/src/pyflask/ai/skill/phases/04-sync.md new file mode 100644 index 000000000..587b9585e --- /dev/null +++ b/src/pyflask/ai/skill/phases/04-sync.md @@ -0,0 +1,112 @@ +## Phase 4: Synchronization Analysis + +**Goal**: Understand how different data streams are temporally aligned and implement sync logic. + +**Entry**: You know all data streams and interfaces from Phase 2. + +**Exit criteria**: For every pair of data streams, you know: +- Whether they share a clock (same timestamps) +- If not, how to align them (TTL pulses, shared events, known offsets) +- The specific implementation plan for temporal alignment + +### Why This Matters + +NWB requires all data in a file to share a common time base. Different recording systems +often run on independent clocks that drift relative to each other. Without proper sync, +behavioral events won't align with neural data. 
+ +### Common Synchronization Patterns + +**Pattern 1: Shared clock (simplest)** +- All data comes from the same system (e.g., SpikeGLX records both neural and NIDQ) +- Or all data was processed together with aligned timestamps +- Action: No sync needed — timestamps are already aligned + +**Pattern 2: TTL pulse alignment** +- One system sends TTL pulses that are recorded by another +- E.g., behavior computer sends trial start TTLs recorded on SpikeGLX NIDQ channel +- Action: Extract TTL times from both streams, use as alignment anchors + +```python +# In NWBConverter.temporally_align_data_interfaces(): +from spikeinterface.extractors import SpikeGLXRecordingExtractor +nidq_recording = SpikeGLXRecordingExtractor(folder_path=path, stream_id="nidq") +nidq_data = nidq_recording.get_traces(channel_ids=["nidq#XA2"]) +# Find rising edges +rising_edges = np.where(np.diff((nidq_data > threshold).astype(int)) > 0)[0] +ttl_times_neural = rising_edges / nidq_recording.get_sampling_frequency() + +# Compare with behavioral event times to compute offset +offset = np.mean(ttl_times_neural[:n] - behavioral_event_times[:n]) +``` + +**Pattern 3: Starting time offset** +- Streams start at different times but run at the same rate +- Action: Compute the offset and use `set_aligned_starting_time()` + +```python +interface.set_aligned_starting_time(offset_seconds) +``` + +**Pattern 4: Interpolation between clocks** +- Streams run on different clocks that may drift +- Periodic sync pulses recorded by both systems +- Action: Use `align_by_interpolation()` with matched timepoints + +```python +interface.align_by_interpolation( + unaligned_timestamps=sync_times_in_this_clock, + aligned_timestamps=sync_times_in_reference_clock +) +``` + +**Pattern 5: Frame-based alignment (imaging)** +- Behavioral data logged per imaging frame +- Action: Use imaging frame times as the time base + +**Pattern 6: Multi-clock interpolation (complex)** +- Multiple independent clocks need cross-alignment (e.g., odor 
clock, behavior clock, imaging clock) +- Action: Chain interpolations through a reference clock + +### Questions to Ask + +> I need to understand how your data streams are synchronized: +> +> 1. Do all your recording systems share a common clock, or are they independent? +> 2. Do you use any synchronization signals (TTL pulses, sync LEDs, shared triggers)? +> 3. If so, which system generates the sync signal and which systems record it? +> 4. Is there a master clock that everything is referenced to? + +Follow up based on answers: +- If TTL: Which channel? What does the pulse pattern mean? (rising edge = trial start?) +- If shared clock: How? (same DAQ, hardware sync, network time?) +- If no sync: Is approximate alignment acceptable? Do files have wall-clock timestamps? + +### What to Record + +Update `conversion_notes.md`: + +```markdown +## Synchronization +- Reference clock: SpikeGLX neural recording +- Behavior → Neural: TTL pulses on NIDQ channel XA2, rising edge = epoch start +- Imaging → Neural: Frame trigger on NIDQ channel XA0 +- Method: Compute mean offset from first N TTL events + +### Sync Implementation Plan +Override `temporally_align_data_interfaces()` in the NWBConverter: +1. Read NIDQ channel XA2 from SpikeGLX +2. Find rising edges → neural epoch times +3. Compare with behavioral file epoch boundaries +4. Compute mean offset +5. 
Shift all behavioral timestamps by offset +``` + +### Push Phase 4 Results + +After documenting the sync plan, commit and push: +```bash +git add conversion_notes.md +git commit -m "Phase 4: synchronization analysis — sync plan documented" +if git remote get-url origin &>/dev/null; then git push; fi +``` diff --git a/src/pyflask/ai/skill/phases/05-code-generation.md b/src/pyflask/ai/skill/phases/05-code-generation.md new file mode 100644 index 000000000..856254732 --- /dev/null +++ b/src/pyflask/ai/skill/phases/05-code-generation.md @@ -0,0 +1,532 @@ +## Phase 5: Code Generation + +**Goal**: Generate a complete, pip-installable conversion repo following CatalystNeuro conventions. + +**Entry**: You have complete experiment spec, interface mapping, metadata, and sync plan. + +**Exit criteria**: A working repo with: +- Correct directory structure (cookiecutter pattern) +- `pyproject.toml` with proper dependencies +- NWBConverter class with all interfaces +- `convert_session.py` with full pipeline +- Custom DataInterface classes where needed +- `metadata.yaml` with all collected metadata +- `convert_all_sessions.py` for batch conversion + +### Step 1: Scaffold the Repository + +Create the standard directory structure INSIDE the repo that was cloned in Phase 1 +(`nwb-conversions/-to-nwb/`). All files below are relative to the repo root: + +``` +./ ← repo root (already cloned from Phase 1) +├── .gitignore ← already created in Phase 1 +├── pyproject.toml +├── README.md +├── make_env.yml +└── src/ + └── _to_nwb/ + ├── __init__.py + └── / + ├── __init__.py + ├── nwbconverter.py + ├── convert_session.py + ├── convert_all_sessions.py + ├── metadata.yaml + └── .py (if needed) +``` + +### Step 2: Write pyproject.toml + +```toml +[project] +name = "-lab-to-nwb" +version = "0.0.1" +description = "NWB conversion scripts for the Lab." 
+readme = "README.md" +requires-python = ">=3.11" +license = { text = "MIT" } +authors = [{ name = "CatalystNeuro", email = "ben.dichter@catalystneuro.com" }] +dependencies = ["neuroconv", "nwbinspector"] + +[project.optional-dependencies] + = [ + "neuroconv[]==", + # Add any additional deps needed for custom interfaces +] + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build] +include = ["*.yaml", "*.yml", "*.json"] + +[tool.hatch.build.targets.wheel] +packages = ["src/_lab_to_nwb"] +``` + +**Extras for NeuroConv** depend on which interfaces are used: +- SpikeGLX: `neuroconv[spikeglx]` +- OpenEphys: `neuroconv[openephys]` +- Phy: `neuroconv[phy]` +- Suite2p: `neuroconv[suite2p]` +- DeepLabCut: `neuroconv[deeplabcut]` +- Check NeuroConv's pyproject.toml for all available extras + +### Step 3: Write the NWBConverter Class + +```python +from neuroconv import NWBConverter +from neuroconv.datainterfaces import ( + # Import NeuroConv interfaces based on interface mapping +) +# Import custom interfaces +from . import CustomInterface + + +class NWBConverter(NWBConverter): + """Primary conversion class.""" + + data_interface_classes = dict( + # Map logical names to interface classes + # Names should be descriptive: Recording, LFP, Sorting, Behavior, etc. + ) + + def temporally_align_data_interfaces(self): + """Override if sync logic is needed.""" + # Implement sync plan from Phase 4 + pass +``` + +### Step 3b: Check Registry for Reusable Custom Interfaces + +Before writing a custom interface from scratch, check the conversion registry for +similar custom interfaces from prior conversions. A prior interface that handles the +same data format or creates the same NWB types can serve as a starting template. 
+ +```python +import yaml + +with open("/tmp/registry.yaml") as f: + registry = yaml.safe_load(f) + +# Search for conversions with custom interfaces that match what we need +needed_nwb_types = ["Position", "BehavioralEvents"] # what our custom data maps to +for conv in registry.get("conversions", []): + if not conv.get("has_custom_interfaces"): + continue + # The full manifest has custom_interfaces detail — fetch it from the repo + print(f"Check {conv['repo']} for custom interfaces") +``` + +If a match is found, fetch the actual interface code from the prior repo via the API: +```bash +NWB_API="https://nwb-conversions-api.ben-dichter.workers.dev" +curl -sf "${NWB_API}/repos//files/" +``` + +Use the fetched code as a starting template, adapting it to the current lab's file format +and column names. Give credit in a comment: `# Adapted from nwb-conversions/`. + +If no match is found, write the custom interface from scratch (Step 4 below). + +### Step 4: Write Custom DataInterface Classes + +For each data stream that needs custom code: + +```python +from neuroconv.basedatainterface import BaseDataInterface +from neuroconv.utils import DeepDict +from pynwb.file import NWBFile + + +class Interface(BaseDataInterface): + """Interface for reading .""" + + keywords = [""] + + def __init__(self, file_path: str): + """ + Parameters + ---------- + file_path : str + Path to the file. 
+ """ + super().__init__(file_path=file_path) + + def get_metadata(self) -> DeepDict: + metadata = super().get_metadata() + # Extract any metadata from the file + return metadata + + def add_to_nwbfile(self, nwbfile: NWBFile, metadata: dict, **kwargs): + # Read data from self.source_data["file_path"] + # Create appropriate PyNWB objects + # Add to nwbfile + pass +``` + +#### Custom Interface Guidelines + +**Metadata responsibility**: A custom interface's `get_metadata()` should only return +metadata that can be extracted FROM THE DATA FILE ITSELF (e.g., session date from filename, +frame rate from timestamps). Lab-level metadata (institution, experimenter) and subject +metadata (species, genotype) should be handled in `convert_session.py` via metadata YAML +and subject metadata files. Do not duplicate metadata loading between the interface and +the conversion script. + +**Use `conversion` parameter, not data transformation**: When data is in non-SI units +(e.g., centimeters), do NOT multiply the data by a conversion factor. Instead, use the +`conversion` parameter on TimeSeries: +```python +# CORRECT: store raw data, use conversion factor +TimeSeries(name="position", data=pos_cm, unit="m", conversion=0.01) + +# WRONG: transform data in-place +TimeSeries(name="position", data=pos_cm * 0.01, unit="m") +``` +This preserves original data values in the file and is more NWB-idiomatic. + +**Set `resolution` when unknown**: If you don't know the resolution (smallest meaningful +difference) of a data stream, explicitly set `resolution=-1.0`. Don't leave it unset. + +**Pickle files cannot be lazily loaded.** Unlike HDF5 or binary files, pickle requires +reading the entire file into memory. This is an acceptable exception to the "load data +lazily in `__init__`" guideline. If the pickle is very large, consider loading only in +`add_to_nwbfile()` instead of `__init__()`. 
+ +**Choosing the right NWB types for custom data:** + +Always use the most specific NWB type available — don't use bare `TimeSeries` when a +subtype exists. See `knowledge/nwb-best-practices.md` for the full set of conventions. + +| Data Type | NWB Container | Where to Add | +|-----------|---------------|--------------| +| Continuous neural signal | `ElectricalSeries` | `nwbfile.add_acquisition()` | +| Position (x, y) | `Position` > `SpatialSeries` | `processing["behavior"]` | +| Running speed | `TimeSeries` | `processing["behavior"]` | +| Lick times | `TimeSeries` (binary) or ndx-events `Events` | `processing["behavior"]` | +| Trial info | `TimeIntervals` | `nwbfile.add_trial()` | +| Epochs | `TimeIntervals` | `nwbfile.add_epoch()` | +| Pupil tracking | `PupilTracking` > `TimeSeries` | `processing["behavior"]` | +| Eye position | `EyeTracking` > `SpatialSeries` | `processing["behavior"]` | +| Stimulus times | `TimeIntervals` | `nwbfile.add_stimulus()` | +| Fluorescence traces | `RoiResponseSeries` | `processing["ophys"]` | +| ROI masks | `PlaneSegmentation` | `processing["ophys"]` | +| Reward events | `TimeSeries` or `LabeledEvents` | `processing["behavior"]` | +| Animal video | `ImageSeries` (external_file) | `nwbfile.add_acquisition()` | +| Compass direction | `CompassDirection` > `SpatialSeries` | `processing["behavior"]` | +| Optogenetic stimulus | `OptogeneticSeries` | `nwbfile.add_stimulus()` | + +**For detailed PyNWB construction patterns by domain, see:** +- `knowledge/pynwb-icephys.md` — intracellular electrophysiology +- `knowledge/pynwb-optogenetics.md` — optogenetic stimulation +- `knowledge/pynwb-ophys-advanced.md` — advanced optical physiology (ROIs, segmentation, motion correction) +- `knowledge/pynwb-behavior.md` — behavior container types (PupilTracking, EyeTracking, etc.) 
+- `knowledge/pynwb-images.md` — image data and external video files +- `knowledge/pynwb-advanced-io.md` — compression, chunking, iterative write for large data +- `knowledge/ndx-fiber-photometry.md` — ndx-fiber-photometry extension (REQUIRED for fiber photometry) +- `knowledge/ndx-pose.md` — ndx-pose extension for pose estimation (DeepLabCut, SLEAP, Lightning Pose) +- `knowledge/ndx-anatomical-localization.md` — ndx-anatomical-localization for electrode/imaging plane atlas registration + +**Single-photon vs. two-photon imaging:** +Miniscope data (UCLA Miniscope, Inscopix nVista/nVoke) is **single-photon** (one-photon) +imaging and MUST use `OnePhotonSeries`, not `TwoPhotonSeries`. Two-photon imaging +(ScanImage, Scanbox, Bruker, Prairie) uses `TwoPhotonSeries`. Getting this wrong is a +common mistake. Check: +- Miniscope → `OnePhotonSeries` (via `MiniscopeImagingInterface`) +- Inscopix → `OnePhotonSeries` (via `InscopixImagingInterface`) +- ScanImage, Scanbox, Bruker → `TwoPhotonSeries` +- If unsure, ask the user whether their microscope uses one-photon or two-photon excitation. + +**Key constraints on SpatialSeries:** +- Only for position data (x, y, z). Velocity and acceleration should use `TimeSeries`. +- Must have 1, 2, or 3 data columns (not more). +- When inside `CompassDirection`, units must be `"degrees"` or `"radians"`. +- When using degrees, data values should be in [-360, 360]; radians in [-2pi, 2pi]. + +#### Behavioral vs. 
Stimulus Data + +When a dataset has both behavioral and stimulus columns (common in VR experiments), +separate them: + +**Behavioral data** → `processing["behavior"]` via `BehavioralTimeSeries`, `Position`, etc.: +- Position / spatial location +- Running speed / velocity +- Lick events / lick rate +- Eye position / pupil diameter +- Pose estimation keypoints + +**Stimulus data** → `nwbfile.add_stimulus()`: +- Visual stimulus parameters (contrast, orientation, spatial frequency) +- Environment parameters (morph value, jitter) +- Optogenetic stimulus waveforms +- Auditory stimulus parameters + +**Reward** can go in either, but prefer `processing["behavior"]` if it represents the +animal's experience (reward delivery events), or `nwbfile.add_stimulus()` if it represents +an experimenter-controlled parameter. + +**Use `get_module()` to get or create processing modules:** +```python +from neuroconv.tools.nwb_helpers import get_module +behavior_module = get_module(nwbfile, "behavior", "Processed behavioral data") +behavior_module.add(my_container) +``` + +**Use `H5DataIO` for compression:** +```python +from hdmf.backends.hdf5.h5_utils import H5DataIO +data_compressed = H5DataIO(data=my_array, compression="gzip") +``` + +#### Time Series Best Practices (from NWB Inspector) + +Follow these in every custom interface and `add_to_nwbfile()` method: + +1. **Time-first orientation**: data shape must be `(n_timepoints, ...)`. If source data is + `(channels, timepoints)`, transpose before adding: `data = data.T` +2. **Timestamps in seconds**: all timestamps are in seconds relative to `session_start_time`. +3. **Ascending, non-negative, no NaN**: timestamps must be sorted ascending, >= 0, no NaN. +4. **Use `rate` for regular sampling**: if the signal has a constant sampling rate, use + `rate=` and `starting_time=` instead of a `timestamps` array. +5. 
**SI units via `conversion`**: set `unit` to the SI unit (e.g., `"m"`, `"V"`) and use + `conversion` to express the factor from stored data to SI. +6. **Every text field must be meaningful**: no empty strings for `description`, `unit`, etc. +7. **Breaks in recording**: if there are gaps, use explicit `timestamps` (not `rate`) or + create separate TimeSeries objects per continuous segment. + +#### Table Best Practices + +When creating DynamicTable objects (trials, epochs, electrodes, custom tables): + +- **Boolean columns**: name with `is_` prefix (e.g., `is_correct`, `is_rewarded`) +- **Timing columns**: name with `_time` suffix (e.g., `start_time`, `reward_time`) +- **No JSON strings**: don't encode structured data as JSON in string columns +- **No empty tables**: don't create tables with zero rows +- **Unique IDs**: keep the default auto-incrementing `id` column + +#### Ecephys Best Practices + +When working with electrodes and spike sorting data: + +- **Electrode `location` is required**: always fill it. Use Allen Brain Atlas terms for mice. + Use `"unknown"` only if the region is truly unknown. +- **Don't duplicate metadata in electrodes table**: don't add `unit`, `gain`, or `offset` + columns. Those belong on `ElectricalSeries` (as `channel_conversion` and `offset`). +- **Spike times must be ascending and positive**: verify sorted order, no negative values. +- **Use `obs_intervals`** on the units table if the recording has gaps. + +#### Video Best Practices + +- **Animal behavior videos** (webcam, running wheel cam): store as external files using + `ImageSeries(external_file=[relative_path], ...)`. Use relative paths. +- **Neural imaging data** (two-photon, miniscope): store internally with lossless compression. +- **Don't set `starting_frame`** unless using `external_file`. 
+ +### Step 5: Write convert_session.py + +Follow the standard pattern: + +```python +from pathlib import Path +from typing import Union +from zoneinfo import ZoneInfo + +from neuroconv.utils import load_dict_from_file, dict_deep_update + +from . import NWBConverter + + +def session_to_nwb( + data_dir_path: Union[str, Path], + output_dir_path: Union[str, Path], + stub_test: bool = False, +): + data_dir_path = Path(data_dir_path) + output_dir_path = Path(output_dir_path) + if stub_test: + output_dir_path = output_dir_path / "nwb_stub" + output_dir_path.mkdir(parents=True, exist_ok=True) + + # Determine session_id and subject_id from path/filenames + session_id = "..." + subject_id = "..." + nwbfile_path = output_dir_path / f"{session_id}.nwb" + + # Build source_data + source_data = dict() + conversion_options = dict() + + # Add each interface with its file paths + source_data["Recording"] = dict(folder_path=str(data_dir_path / "...")) + conversion_options["Recording"] = dict(stub_test=stub_test) + + # Conditionally add interfaces if files exist + behavior_path = data_dir_path / "behavior.txt" + if behavior_path.is_file(): + source_data["Behavior"] = dict(file_path=str(behavior_path)) + conversion_options["Behavior"] = dict() + + # Create converter + converter = NWBConverter(source_data=source_data) + + # Get and merge metadata + metadata = converter.get_metadata() + + metadata_path = Path(__file__).parent / "metadata.yaml" + editable_metadata = load_dict_from_file(metadata_path) + metadata = dict_deep_update(metadata, editable_metadata) + + # Set session-specific metadata + tz = ZoneInfo("") + if metadata["NWBFile"]["session_start_time"]: + metadata["NWBFile"]["session_start_time"] = ( + metadata["NWBFile"]["session_start_time"].replace(tzinfo=tz) + ) + metadata["NWBFile"]["session_id"] = session_id + + # Subject metadata — subject_id is required for DANDI + metadata["Subject"]["subject_id"] = subject_id + # Load per-subject metadata from file if available + # See 
knowledge/nwb-best-practices.md for required formats: + # species: Latin binomial (e.g., "Mus musculus") + # sex: one of "M", "F", "U", "O" + # age: ISO 8601 duration (e.g., "P90D") + # weight: "numeric unit" (e.g., "0.025 kg") + + # Run conversion + converter.run_conversion( + nwbfile_path=nwbfile_path, + metadata=metadata, + conversion_options=conversion_options, + overwrite=True, + ) + + +if __name__ == "__main__": + # Example usage + data_dir_path = Path("/path/to/data") + output_dir_path = Path("/path/to/output") + session_to_nwb( + data_dir_path=data_dir_path, + output_dir_path=output_dir_path, + stub_test=True, # Set to False for full conversion + ) +``` + +### Step 6: Write convert_all_sessions.py + +```python +from pathlib import Path +from concurrent.futures import ProcessPoolExecutor +import traceback + +from .convert_session import session_to_nwb + + +def get_session_to_nwb_kwargs_per_session(data_dir_path): + """Discover all sessions and return kwargs for each.""" + # Implement session discovery logic + # Return list of dicts, each with kwargs for session_to_nwb + raise NotImplementedError("Implement session discovery") + + +def safe_session_to_nwb(**kwargs): + """Wrapper that catches and logs exceptions.""" + exception_file_path = kwargs.pop("exception_file_path", None) + try: + session_to_nwb(**kwargs) + except Exception: + if exception_file_path: + with open(exception_file_path, "w") as f: + f.write(traceback.format_exc()) + else: + raise + + +def dataset_to_nwb( + data_dir_path, + output_dir_path, + max_workers=1, + stub_test=False, +): + data_dir_path = Path(data_dir_path) + output_dir_path = Path(output_dir_path) + exception_dir = output_dir_path / "exceptions" + exception_dir.mkdir(parents=True, exist_ok=True) + + kwargs_list = get_session_to_nwb_kwargs_per_session(data_dir_path) + + with ProcessPoolExecutor(max_workers=max_workers) as executor: + for kwargs in kwargs_list: + kwargs["output_dir_path"] = output_dir_path + kwargs["stub_test"] = 
stub_test + session_id = kwargs.get("session_id", "unknown") + kwargs["exception_file_path"] = str(exception_dir / f"{session_id}.txt") + executor.submit(safe_session_to_nwb, **kwargs) +``` + +### Step 7: Write metadata.yaml + +Use the metadata collected in Phase 3. See Phase 3 for format. + +### Step 8: Write README.md + +```markdown +# -lab-to-nwb + +NWB conversion scripts for the [ Lab](lab_url) data, +using [NeuroConv](https://github.com/catalystneuro/neuroconv). + +## Installation + +```bash +pip install -lab-to-nwb +``` + +## Usage + +### Single session +```python +from ..convert_session import session_to_nwb + +session_to_nwb( + data_dir_path="/path/to/session", + output_dir_path="/path/to/output", + stub_test=False, +) +``` + +### All sessions +```python +from ..convert_all_sessions import dataset_to_nwb + +dataset_to_nwb( + data_dir_path="/path/to/data", + output_dir_path="/path/to/output", + max_workers=4, +) +``` +``` + +### Step 9: Commit and Push to nwb-conversions + +After all code is generated and the repo is scaffolded, commit everything and push to the +`nwb-conversions` GitHub org. The remote was set up in Phase 1 via `gh repo create --clone`. + +```bash +git add -A +git commit -m "Add conversion code for + +Generated by nwb-convert skill. Includes: +- NWBConverter with interfaces +- custom DataInterface classes +- convert_session.py and convert_all_sessions.py +- metadata.yaml with lab and experiment metadata" +if git remote get-url origin &>/dev/null; then git push; fi +``` + +This makes the conversion code immediately available in the org for reference by future +conversions. The manifest will be added in Phase 7 after DANDI upload is complete. 
diff --git a/src/pyflask/ai/skill/phases/06-testing.md b/src/pyflask/ai/skill/phases/06-testing.md new file mode 100644 index 000000000..ad3ca51f9 --- /dev/null +++ b/src/pyflask/ai/skill/phases/06-testing.md @@ -0,0 +1,231 @@ +## Phase 6: Testing & Validation + +**Goal**: Verify the conversion produces valid, complete NWB files. + +**Entry**: You have generated all conversion code from Phase 5. + +**Exit criteria**: The conversion runs successfully on at least one session, the output +passes nwbinspector validation, and the data can be read back correctly. + +### Step 1: Install the Package + +```bash +cd +pip install -e ".[]" +``` + +### Step 2: Run a Stub Test + +First, run with `stub_test=True` to convert a small subset of data quickly: + +```python +from ..convert_session import session_to_nwb + +session_to_nwb( + data_dir_path="/path/to/sample/session", + output_dir_path="/path/to/output", + stub_test=True, +) +``` + +If this fails, debug the error: +- Import errors → missing dependencies in pyproject.toml +- File not found → incorrect source_data paths +- Type errors → incorrect data shapes or types in custom interfaces +- Schema validation errors → metadata doesn't match expected schema + +### Step 3: Inspect the NWB File + +Read back the file and verify contents: + +```python +from pynwb import NWBHDF5IO + +with NWBHDF5IO("/path/to/output/session.nwb", "r") as io: + nwbfile = io.read() + + # Check basic metadata + print(f"Session: {nwbfile.session_description}") + print(f"Start time: {nwbfile.session_start_time}") + print(f"Subject: {nwbfile.subject}") + + # Check acquisition data + print(f"Acquisition: {list(nwbfile.acquisition.keys())}") + + # Check processing modules + for name, module in nwbfile.processing.items(): + print(f"Processing/{name}: {list(module.data_interfaces.keys())}") + + # Check units + if nwbfile.units: + print(f"Units: {len(nwbfile.units)} units") + + # Check trials + if nwbfile.trials: + print(f"Trials: {len(nwbfile.trials)} trials") 
+ print(f"Trial columns: {nwbfile.trials.colnames}") + + # Check electrodes + if nwbfile.electrodes: + print(f"Electrodes: {len(nwbfile.electrodes)} electrodes") + + # Spot-check data values + for name, ts in nwbfile.acquisition.items(): + if hasattr(ts, 'data'): + print(f" {name}: shape={ts.data.shape}, dtype={ts.data.dtype}") +``` + +### Step 4: Run NWB Inspector + +**You MUST run nwbinspector on every converted file.** Do not skip this step or leave it for the user. + +Run it via bash and capture the full output: + +```bash +nwbinspector /path/to/output/session.nwb +``` + +Then analyze every message in the output. NWB Inspector reports issues at 4 severity levels: + +| Level | Meaning | Action Required | +|-------|---------|-----------------| +| `CRITICAL_IMPORTANCE` | Will break downstream tools or DANDI upload | **Must fix before proceeding** | +| `BEST_PRACTICE_VIOLATION` | Violates NWB best practices | **Fix all of these** | +| `BEST_PRACTICE_SUGGESTION` | Could be improved | Fix if straightforward, otherwise note for the user | +| `PYNWB_VALIDATION` | PyNWB schema violations | **Must fix before proceeding** | + +**For each issue reported, you must:** +1. Identify the root cause in the conversion code +2. Fix the code (metadata, interface, or convert_session.py) +3. Re-run the conversion (stub_test=True) +4. 
Re-run nwbinspector to confirm the fix + +**Common issues and their fixes:** + +| Inspector Message | Fix | +|-------------------|-----| +| `check_session_start_time_old_date` | Session start time is wrong or default — extract real date from source files | +| `check_session_start_time_future_date` | Timezone conversion error — verify ZoneInfo usage | +| `check_missing_text_for_session_description` | Add `session_description` to metadata.yaml or set it in convert_session.py | +| `check_subject_species_latin_binomial` | Use "Mus musculus" not "mouse", "Rattus norvegicus" not "rat" | +| `check_subject_species_form` | Species should be binomial (e.g., "Mus musculus") | +| `check_subject_age` | Format as ISO 8601 duration: "P90D" not "90 days" | +| `check_subject_sex` | Must be one of: "M", "F", "U", "O" | +| `check_data_orientation` | Time should be the first dimension. Transpose data if needed | +| `check_timestamps_match_first_dimension` | Length of timestamps must equal first dim of data | +| `check_regular_timestamps` | If data has constant rate, use `rate` + `starting_time` instead of `timestamps` | +| `check_timestamp_of_the_first_sample_is_not_negative` | Timestamps should start >= 0. 
Adjust offset | +| `check_missing_unit` | TimeSeries must have `unit` specified | +| `check_resolution` | Set resolution=-1.0 if unknown, otherwise provide actual resolution | +| `check_electrodes_table_global_ids_are_not_unique` | Electrode IDs must be unique across all probes | +| `check_empty_string_for_*` | Replace empty strings with actual descriptions | +| `check_imaging_plane_excitation_lambda` | Set `excitation_lambda` on ImagingPlane in metadata | +| `check_imaging_plane_indicator` | Set `indicator` on ImagingPlane (e.g., "GCaMP6f") | +| `check_imaging_plane_location` | Set `location` on ImagingPlane (e.g., "CA1") | +| `check_rate_is_not_zero` | TwoPhotonSeries must have nonzero `rate` — check Suite2p ops["fs"] | +| `check_plane_segmentation_image_mask_shape` | ROI masks must match imaging plane dimensions | +| `check_spatial_series_dims` | SpatialSeries must have 1, 2, or 3 data columns only | +| `check_compass_direction_unit` | CompassDirection SpatialSeries must use "degrees" or "radians" | +| `check_image_series_data_size` | Animal behavior videos should use external_file, not internal storage | +| `check_image_series_external_file_relative` | External file paths must be relative, not absolute | +| `check_no_empty_string_for_*` | All text fields (description, unit) must be non-empty | +| `check_timestamps_without_nans` | Timestamps must not contain NaN values | +| `check_timestamps_ascending` | Timestamps must be sorted in ascending order | +| `check_negative_spike_times` | All spike times must be >= 0 (session-aligned, not trial-aligned) | +| `check_ascending_spike_times` | Spike times within each unit must be in ascending order | +| `check_subject_exists` | NWBFile must have a Subject object | +| `check_subject_id_exists` | Subject must have subject_id set (required for DANDI) | +| `check_electrode_location` | Electrode location column must be filled (use "unknown" if needed) | + +**Also run `dandi validate` if the user plans to upload to DANDI:** + 
+```bash +dandi validate /path/to/output/ +``` + +This catches DANDI-specific requirements beyond nwbinspector: +- `subject_id` must be set +- `session_id` must be set +- File naming conventions for DANDI organize + +**Keep iterating until nwbinspector produces zero CRITICAL and zero BEST_PRACTICE_VIOLATION messages.** +Show the user the final clean nwbinspector output as confirmation. + +### Step 5: Run Full Conversion (one session) + +Once stub_test passes and nwbinspector is clean, run with `stub_test=False` on a single session: + +```python +session_to_nwb( + data_dir_path="/path/to/sample/session", + output_dir_path="/path/to/output", + stub_test=False, +) +``` + +Then run nwbinspector again on the full output — some issues only appear with real data +(e.g., data orientation problems, timestamp gaps, large uncompressed datasets). + +### Step 6: Validate Data Integrity + +For critical data streams, compare source and NWB values: + +```python +import numpy as np + +# Example: verify spike times +with NWBHDF5IO("output.nwb", "r") as io: + nwbfile = io.read() + nwb_spike_times = nwbfile.units["spike_times"][0] + +# Compare with source +import spikeinterface.extractors as se +sorting = se.read_phy(phy_path) +source_spike_times = sorting.get_unit_spike_train(unit_id=0, return_times=True) + +assert np.allclose(nwb_spike_times, source_spike_times, atol=1e-6) +``` + +### Step 7: Iterate + +If any issues are found: +1. Fix the issue in the conversion code +2. Re-run the stub test +3. Re-run nwbinspector — confirm zero CRITICAL/BEST_PRACTICE_VIOLATION +4. Re-run full conversion +5. Re-validate +6. 
Repeat until clean + +### Common Debugging Patterns + +**Interface won't instantiate:** +- Check that file paths in source_data are correct +- Check that the file format is what you think it is +- Try instantiating the interface in isolation + +**Data shapes are wrong:** +- Print the data shape at each step of custom interface +- Check if axes need to be transposed +- Check if time is first dimension (NWB convention) + +**Timestamps don't make sense:** +- Check if timestamps are in seconds (NWB convention) +- Check timezone handling +- Print first/last timestamps and compare with expected session duration + +**Metadata schema validation fails:** +- Print the metadata dict and compare with schema +- Check for required fields that are None or empty +- Check types (datetime vs string, list vs single value) + +### Push Phase 6 Results + +After all tests pass and nwbinspector is clean, commit any bug fixes and push: +```bash +git add -A +git commit -m "Phase 6: testing and validation — all checks passing + +nwbinspector: 0 CRITICAL, 0 BEST_PRACTICE_VIOLATION +dandi validate: passed" +if git remote get-url origin &>/dev/null; then git push; fi +``` diff --git a/src/pyflask/ai/skill/phases/07-dandi-upload.md b/src/pyflask/ai/skill/phases/07-dandi-upload.md new file mode 100644 index 000000000..b8a312371 --- /dev/null +++ b/src/pyflask/ai/skill/phases/07-dandi-upload.md @@ -0,0 +1,913 @@ +## Phase 7: DANDI Upload + +**Goal**: Upload validated NWB files to the DANDI Archive for public sharing. + +**Entry**: All NWB files are converted, validated with nwbinspector, and ready for sharing. + +**Exit criteria**: Data is uploaded to DANDI, organized correctly, and accessible via the Dandiset URL. + +### Step 0: Choose DANDI Instance + +**Always ask this first.** Before any upload steps, ask the user which DANDI instance to use: + +> We're ready to upload your NWB files to DANDI! First, which DANDI instance would you +> like to use? +> +> 1. 
**DANDI Sandbox** (sandbox.dandiarchive.org) — for testing. Data can be deleted.
+>    Use this if you want to verify everything works before publishing for real.
+> 2. **DANDI Archive** (dandiarchive.org) — the official public archive. Use this when
+>    you're ready to publish your data permanently.
+>
+> Which would you prefer?
+
+Set the instance URL based on their choice:
+- **Sandbox**: `DANDI_INSTANCE_URL=https://sandbox.dandiarchive.org`
+  and `DANDI_API_URL=https://api.sandbox.dandiarchive.org/api`
+- **Archive**: use the defaults (no env vars needed)
+
+For sandbox uploads, add `-i dandi-sandbox` to all `dandi` CLI commands.
+
+### Prerequisites
+
+Before uploading, the user needs:
+1. A DANDI account (on the chosen instance — sandbox and archive have separate accounts)
+2. A DANDI API key (from user profile on the chosen instance)
+3. A Dandiset created on the chosen instance (or you help them create one)
+4. The `dandi` CLI installed (`pip install -U dandi`)
+
+### Step 1: Create a Dandiset
+
+Guide the user through creating a Dandiset on the DANDI Archive:
+
+> Before we upload, we need to create a Dandiset on DANDI Archive. Have you already
+> created one? If not, here's how:
+>
+> 1. Go to https://dandiarchive.org and log in (or create an account)
+> 2. Click "New Dandiset" in the top right
+> 3. Fill in the metadata:
+>    - **Name**: A descriptive title for your dataset
+>    - **Description**: Abstract or summary of the dataset
+>    - **License**: Usually CC-BY-4.0 for open data
+>    - **Contributors**: Add all contributors with their ORCID IDs
+> 4. Note the 6-digit Dandiset ID (e.g., "000123")
+
+If the data should be embargoed (not publicly visible yet):
+> If your data needs to be embargoed (e.g., pending publication), select the
+> embargo option when creating the Dandiset. Embargoed data is only visible
+> to Dandiset owners until you release it.
+ +### Step 2: Set Up API Key + +```bash +# Get your API key from https://dandiarchive.org (click your initials → API Key) +export DANDI_API_KEY= +``` + +> You'll need your DANDI API key. Go to https://dandiarchive.org, click your +> initials in the top right, and copy your API key. Then set it as an environment +> variable: +> ```bash +> export DANDI_API_KEY=your_key_here +> ``` + +### Step 3: Validate Before Upload + +Run `dandi validate` on the NWB files before uploading: + +```bash +dandi validate /path/to/nwb/output/ +``` + +This checks for DANDI-specific requirements beyond what nwbinspector catches: +- File naming conventions +- Required metadata fields (subject_id, session_id) +- NWB file structure compliance + +Fix any validation errors before proceeding. + +### Step 4: Upload Using NeuroConv Helper (Recommended) + +NeuroConv provides `automatic_dandi_upload()` which handles download, organize, and upload: + +```python +from neuroconv.tools.data_transfers import automatic_dandi_upload + +automatic_dandi_upload( + dandiset_id="000123", # 6-digit Dandiset ID + nwb_folder_path="./nwb_output", # Folder with all NWB files + sandbox=False, # True for testing on sandbox server + number_of_jobs=1, # Parallel upload jobs + number_of_threads=4, # Threads per upload +) +``` + +This function: +1. Downloads the Dandiset metadata (creates the local Dandiset structure) +2. Runs `dandi organize` to rename files to DANDI conventions (sub-/sub-_ses-.nwb) +3. Uploads all organized NWB files + +### Step 5: Upload Using DANDI CLI (Alternative) + +If the NeuroConv helper doesn't work, use the DANDI CLI directly: + +```bash +# 1. Download the Dandiset structure +dandi download https://dandiarchive.org/dandiset/000123/draft +cd 000123 + +# 2. Organize NWB files into DANDI structure (renames files) +dandi organize /path/to/nwb/output/ -f dry # Preview first +dandi organize /path/to/nwb/output/ # Execute + +# 3. Validate +dandi validate . + +# 4. 
Upload +dandi upload +``` + +### Step 5b: Upload Using DANDI Python API (Alternative) + +If the CLI approaches have issues (e.g., sandbox identifier format), use the Python API directly: + +```python +from pathlib import Path +from dandi.dandiapi import DandiAPIClient + +client = DandiAPIClient.from_environ() # or DandiAPIClient(api_url="https://api.sandbox.dandiarchive.org/api") +client.dandi_authenticate() +dandiset = client.get_dandiset("000123", "draft") + +# Upload each organized NWB file +# NOTE: iter_upload_raw_asset() is on the RemoteDandiset object, NOT on DandiAPIClient +nwb_dir = Path("./000123") +for nwb_path in sorted(nwb_dir.rglob("*.nwb")): + asset_path = str(nwb_path.relative_to(nwb_dir)) + print(f"Uploading {asset_path}...") + for status in dandiset.iter_upload_raw_asset(nwb_path, asset_metadata={"path": asset_path}): + if isinstance(status, dict) and status.get("status") == "done": + print(f" Done: {status['asset'].path}") +``` + +**DANDI sandbox URL**: Always use `https://api.sandbox.dandiarchive.org/api` for the +sandbox. The older `api-staging.dandiarchive.org` URL redirects and strips auth headers, +causing 401 errors on write operations. + +### Step 6: Verify on DANDI + +After upload completes: +> Your data is now on DANDI! You can view it at: +> https://dandiarchive.org/dandiset/000123/draft +> +> Please verify: +> 1. All sessions appear in the file listing +> 2. The metadata looks correct +> 3. You can stream and preview the NWB files in Neurosift +> +> When you're ready to publish (make it permanently citable with a DOI), +> click "Publish" on the Dandiset page. This creates an immutable version. + +### Step 7: Edit Dandiset Metadata + +After uploading, programmatically populate the Dandiset metadata using the DANDI API. +If there is an associated manuscript, use OpenAlex to auto-populate contributors, funders, +and affiliations. + +> Now let's complete your Dandiset metadata so it's ready for publication. 
+> Is there an associated publication or preprint? If so, please share the DOI +> (e.g., `10.1038/s41586-023-06031-6`). + +#### 7a. Fetch Structured Data from OpenAlex + +If the user provides a DOI, query OpenAlex to get authors, ORCIDs, affiliations, ROR IDs, +and funding info: + +```python +import requests + +doi = "10.1038/s41467-023-43250-x" # user-provided +response = requests.get(f"https://api.openalex.org/works/doi:{doi}") +work = response.json() + +# Title +title = work["title"] + +# Authors with ORCIDs, affiliations, and ROR IDs +for authorship in work["authorships"]: + author = authorship["author"] + name = author["display_name"] # e.g., "Steffen Schneider" + orcid = author.get("orcid") # e.g., "https://orcid.org/0000-0003-2327-6459" + is_corresponding = authorship["is_corresponding"] + for inst in authorship.get("institutions", []): + inst_name = inst["display_name"] # e.g., "Columbia University" + inst_ror = inst.get("ror") # e.g., "https://ror.org/00hj8s172" + +# Funders with ROR IDs and award numbers +# NOTE: OpenAlex grants are often empty — check the paper's acknowledgments section +# and ask the user to confirm funding information +for grant in work.get("grants", []): + funder_name = grant["funder_display_name"] # e.g., "National Institute of Mental Health" + funder_ror = grant.get("funder", {}).get("ror") # e.g., "https://ror.org/04xeg9z08" + award_id = grant.get("funder_award_id") # e.g., "R21MH117788" +``` + +**OpenAlex data quality warnings:** +- Some authors have **null ORCIDs** — only add `identifier` to the DANDI contributor + when an ORCID actually exists. Do not set it to `null` or empty string. +- The `grants` array is **often empty** even for well-funded papers — always cross-reference + the paper's acknowledgments section and ask the user. +- OpenAlex may list **extra institutional affiliations** (historical or secondary) that + don't match the paper. Include all but flag them for the user to review. 
+
+Present the extracted data to the user for confirmation:
+
+> I found the following from OpenAlex for your paper "{title}":
+>
+> **Authors:**
+> 1. Last, First (ORCID: 0000-...) — Institution (ROR: ...)
+> 2. ...
+>
+> **Funding:**
+> 1. Agency Name — Award: XYZ123 (ROR: ...)
+>
+> Does this look correct? Should I add or remove anyone? Who should be the contact person?
+
+#### 7b. Validate Identifiers
+
+Before applying any metadata, validate all ORCID and ROR identifiers against their
+respective APIs to prevent bad data from being committed:
+
+```python
+def validate_orcid(orcid: str) -> bool:
+    """Validate ORCID exists. orcid should be bare ID like '0000-0001-2345-6789'."""
+    resp = requests.head(
+        f"https://pub.orcid.org/v3.0/{orcid}",
+        headers={"Accept": "application/json"},
+    )
+    return resp.status_code == 200
+
+def validate_ror(ror_url: str) -> bool:
+    """Validate ROR ID exists. ror_url like 'https://ror.org/01cwqze88'.
+
+    NOTE: ROR API v2 changed the response schema — org name is in
+    org["names"][0]["value"], not org["name"]. Some OpenAlex ROR IDs
+    may be stale (return 404) due to organization mergers.
+    """
+    ror_id = ror_url.replace("https://ror.org/", "")
+    resp = requests.get(f"https://api.ror.org/v2/organizations/{ror_id}")
+    return resp.status_code == 200
+```
+
+Run validation on all extracted identifiers and warn the user about any that fail:
+
+```python
+for authorship in work["authorships"]:
+    # OpenAlex emits "orcid": null for some authors; .get("orcid", "") would
+    # return None in that case, so guard with `or ""` before .replace()
+    orcid = (authorship["author"].get("orcid") or "").replace("https://orcid.org/", "")
+    if orcid and not validate_orcid(orcid):
+        print(f"WARNING: ORCID {orcid} for {authorship['author']['display_name']} not found")
+
+    for inst in authorship.get("institutions", []):
+        ror = inst.get("ror")
+        if ror and not validate_ror(ror):
+            print(f"WARNING: ROR {ror} for {inst['display_name']} not found")
+```
+
+#### 7c. 
Look Up Ontology Terms for the `about` Field + +Use the EBI Ontology Lookup Service (OLS4) to find proper ontology identifiers for brain +regions, disorders, and cell types. Never guess or fabricate ontology identifiers. + +```python +def lookup_ontology_term(term: str, ontology: str = "uberon") -> list[dict]: + """Search EBI OLS4 for an ontology term. + + ontology: 'uberon' (anatomy), 'doid' (disease), 'cl' (cell type) + """ + resp = requests.get( + "https://www.ebi.ac.uk/ols4/api/search", + params={"q": term, "ontology": ontology, "rows": "5", "queryFields": "label,synonym"}, + ) + results = resp.json().get("response", {}).get("docs", []) + return [{"label": r["label"], "iri": r["iri"], "obo_id": r.get("obo_id")} for r in results] + +# Example: look up "hippocampus" +terms = lookup_ontology_term("hippocampus", "uberon") +# → [{"label": "hippocampal formation", "iri": "http://purl.obolibrary.org/obo/UBERON_0002421", +# "obo_id": "UBERON:0002421"}, ...] +``` + +**OLS4 search pitfalls — always use exact label matching:** + +OLS4 often returns sub-regions or synonyms instead of the term you want: +- Searching "primary motor cortex" may return "primary motor cortex layer 6" as the top result +- Searching "secondary motor cortex" may return "premotor cortex" (a synonym with the same UBERON ID) +- Searching "dorsomedial striatum" returns unrelated terms — search for "dorsal striatum" instead + +**Always iterate through results and match by exact label** (case-insensitive) before +falling back to the first result: + +```python +def lookup_ontology_term_exact(term, ontology="uberon"): + """Search OLS4 with exact label matching.""" + results = lookup_ontology_term(term, ontology) + # Prefer exact label match + for r in results: + if r["label"].lower() == term.lower(): + return r + # Fall back to first result if no exact match + return results[0] if results else None +``` + +**Maintain a fallback table** for commonly used terms where OLS4 search is unreliable: + +```python 
+UBERON_FALLBACKS = { + "primary visual cortex": {"label": "primary visual cortex", "obo_id": "UBERON:0002436", + "iri": "http://purl.obolibrary.org/obo/UBERON_0002436"}, + "secondary visual cortex": {"label": "secondary visual cortex", "obo_id": "UBERON:0022232", + "iri": "http://purl.obolibrary.org/obo/UBERON_0022232"}, + "primary motor cortex": {"label": "primary motor cortex", "obo_id": "UBERON:0001384", + "iri": "http://purl.obolibrary.org/obo/UBERON_0001384"}, + "secondary motor cortex": {"label": "secondary motor cortex", "obo_id": "UBERON:0016634", + "iri": "http://purl.obolibrary.org/obo/UBERON_0016634"}, + "primary somatosensory cortex": {"label": "primary somatosensory cortex", "obo_id": "UBERON:0008933", + "iri": "http://purl.obolibrary.org/obo/UBERON_0008933"}, + "dorsal striatum": {"label": "dorsal striatum", "obo_id": "UBERON:0005382", + "iri": "http://purl.obolibrary.org/obo/UBERON_0005382"}, + "nucleus accumbens": {"label": "nucleus accumbens", "obo_id": "UBERON:0001882", + "iri": "http://purl.obolibrary.org/obo/UBERON_0001882"}, +} +``` + +Present results to the user and add confirmed terms to `about`: +```python +metadata["about"] = [ + { + "schemaKey": "Anatomy", + "name": "hippocampal formation", + "identifier": "UBERON:0002421", + }, +] +``` + +Supported ontology → `schemaKey` mapping: +| Ontology | `schemaKey` | Use for | +|----------|-------------|---------| +| UBERON | `Anatomy` | Brain regions, anatomical structures | +| DOID | `Disorder` | Diseases, disorders | +| CL | `Anatomy` | Cell types | +| HP | `Disorder` | Human phenotypes | + +#### 7d. Build the Metadata and Set via DANDI API + +Use the `dandi` Python client to programmatically update the Dandiset metadata. + +**IMPORTANT**: Never call `set_raw_metadata()` directly — it accepts invalid metadata silently. 
+Always use this `validate_and_save` wrapper that validates against the DANDI JSON schema first: + +```python +import requests, jsonschema +from dandi.dandiapi import DandiAPIClient + +_schema_cache = {} + +def validate_and_save(dandiset, metadata): + """Validate metadata against the canonical DANDI JSON schema, then save. + + Raises ValueError if metadata is invalid. Uses the official schema from + https://github.com/dandi/schema (not dandischema.models.model_json_schema(), + which has Pydantic v2 generation bugs with anyOf/type conflicts). + """ + version = metadata.get("schemaVersion", "0.7.0") + if version not in _schema_cache: + url = f"https://raw.githubusercontent.com/dandi/schema/refs/heads/master/releases/{version}/dandiset.json" + _schema_cache[version] = requests.get(url).json() + schema = _schema_cache[version] + + validator = jsonschema.Draft202012Validator(schema) + errors = sorted(validator.iter_errors(metadata), key=lambda e: list(e.absolute_path)) + if errors: + print(f"Schema validation FAILED ({len(errors)} errors):") + for err in errors: + path = ".".join(str(p) for p in err.absolute_path) + print(f" {path}: {err.message}") + raise ValueError("Fix validation errors before saving") + + dandiset.set_raw_metadata(metadata) + print("Metadata validated and saved!") + +client = DandiAPIClient.from_environ() # uses DANDI_API_KEY env var +dandiset = client.get_dandiset("000123", "draft") +metadata = dandiset.get_raw_metadata() +``` + +**Schema validation approach**: Always start from `dandiset.get_raw_metadata()` which +includes server-generated fields (`id`, `citation`, `assetsSummary`, `manifestLocation`). +Mutate only the fields you control (name, description, contributors, etc.), then validate +the **complete** metadata dict. Do NOT strip server-generated fields before validation — +they are required by the schema. 
+
+**Set title and description:**
+```python
+metadata["name"] = title  # from OpenAlex or user
+metadata["description"] = description  # paper abstract or user-provided
+metadata["keywords"] = ["hippocampus", "electrophysiology", "place cells"]  # user-provided
+```
+
+**Set contributors (persons):**
+Convert OpenAlex author names from "First Last" to "Last, First" format. Mark the
+corresponding author as ContactPerson. Mark all authors with `includeInCitation: True`.
+
+```python
+contributors = []
+for authorship in work["authorships"]:
+    author = authorship["author"]
+    display_name = author["display_name"]
+    # Convert "First Last" → "Last, First"
+    parts = display_name.rsplit(" ", 1)
+    dandi_name = f"{parts[-1]}, {parts[0]}" if len(parts) == 2 else display_name
+
+    # OpenAlex emits "orcid": null for some authors; guard with `or ""` so
+    # .replace() is never called on None
+    orcid = (author.get("orcid") or "").replace("https://orcid.org/", "")
+    roles = ["dcite:Author"]
+    if authorship["is_corresponding"]:
+        roles.append("dcite:ContactPerson")
+
+    person = {
+        "schemaKey": "Person",
+        "name": dandi_name,
+        "roleName": roles,
+        "includeInCitation": True,
+    }
+    if orcid:
+        person["identifier"] = orcid
+    # Add email for contact person (ask user)
+    if authorship["is_corresponding"]:
+        person["email"] = contact_email  # must ask user for this
+
+    # Add affiliation — IMPORTANT: schemaKey must be "Affiliation", not "Organization"
+    # "Organization" is for top-level contributors (funders); "Affiliation" is for person affiliations
+    affiliations = []
+    for inst in authorship.get("institutions", []):
+        aff = {
+            "schemaKey": "Affiliation",
+            "name": inst["display_name"],
+        }
+        if inst.get("ror"):
+            aff["identifier"] = inst["ror"]
+        affiliations.append(aff)
+    if affiliations:
+        person["affiliation"] = affiliations
+
+    contributors.append(person)
+```
+
+**Add data curators (the people who performed the conversion):**
+
+Data curators are NOT authors — they get `dcite:DataCurator` role only, and
+`includeInCitation: False` unless they made intellectual contributions to the dataset.
+ +```python +# Add each person who worked on the NWB conversion +contributors.append({ + "schemaKey": "Person", + "name": "Last, First", # person who ran the conversion + "identifier": "0000-0001-2345-6789", # their ORCID + "roleName": ["dcite:DataCurator"], + "includeInCitation": False, + "email": "curator@example.com", + "affiliation": [{"schemaKey": "Affiliation", "name": "CatalystNeuro"}], +}) +``` + +**Add funders as Organization contributors:** +```python +for grant in work.get("grants", []): + funder = { + "schemaKey": "Organization", + "name": grant["funder_display_name"], + "roleName": ["dcite:Funder"], + "includeInCitation": False, + } + if grant.get("funder", {}).get("ror"): + funder["identifier"] = grant["funder"]["ror"] + if grant.get("funder_award_id"): + funder["awardNumber"] = grant["funder_award_id"] + contributors.append(funder) +``` + +**Set contributors on metadata:** +```python +metadata["contributor"] = contributors +``` + +**Add related resources:** +```python +related = [] + +# Associated publication +related.append({ + "schemaKey": "Resource", + "identifier": f"doi:{doi}", + "url": f"https://doi.org/{doi}", + "name": title, + "relation": "dcite:IsDescribedBy", + "resourceType": "dcite:JournalArticle", # or dcite:Preprint +}) + +# Conversion code repo (if on GitHub) +related.append({ + "schemaKey": "Resource", + "url": "https://github.com/catalystneuro/lab-to-nwb", + "name": "NWB conversion code", + "relation": "dcite:IsSupplementedBy", + "resourceType": "dcite:Software", +}) + +metadata["relatedResource"] = related +``` + +**Add ontology terms to `about` (from 7c results):** +```python +metadata["about"] = [ + {"schemaKey": "Anatomy", "name": "hippocampal formation", "identifier": "UBERON:0002421"}, + # add more terms as appropriate for the experiment +] +``` + +**Add ethics approval (ask user):** +```python +metadata["ethicsApproval"] = [{ + "schemaKey": "EthicsApproval", + "identifier": "IACUC Protocol #12345", # ask user + 
"contactPoint": { + "schemaKey": "ContactPoint", + "name": "Columbia University IACUC", # ask user + }, +}] +``` + +**Set license and access:** +```python +metadata["license"] = ["spdx:CC-BY-4.0"] +metadata["access"] = [{ + "schemaKey": "AccessRequirements", + "status": "dandi:OpenAccess", +}] +``` + +**Validate and save (uses the wrapper defined above — never call `set_raw_metadata` directly):** +```python +validate_and_save(dandiset, metadata) +``` + +#### 7e. Metadata Quality Checklist + +Before saving, verify the metadata covers all quality criteria: + +- [ ] Is the title descriptive and publication-quality? +- [ ] Does the description mention data modalities and recording methods? +- [ ] Does the description include a brief methodology summary? +- [ ] Are associated publications linked with DOIs and correct relation (`dcite:IsDescribedBy`)? +- [ ] Are all paper authors listed as contributors with ORCIDs? +- [ ] Do contributors have institutional affiliations with ROR identifiers? +- [ ] Are funders listed with award numbers and ROR identifiers? +- [ ] Are relevant brain regions / anatomical structures in the `about` field (UBERON)? +- [ ] Is the license specified (`spdx:CC-BY-4.0`)? +- [ ] Is the IACUC/IRB protocol number included in `ethicsApproval`? +- [ ] Are keywords provided for discoverability? +- [ ] Is at least one contributor marked as `dcite:ContactPerson` with an email? + +#### 7f. Additional Metadata to Ask the User + +After auto-populating from OpenAlex, ask the user for anything that can't be extracted: + +> I've populated the metadata from your paper. A few more things: +> +> 1. **Contact person email**: What email should be listed for the contact person? +> 2. **Ethics approval**: What is your IACUC/IRB protocol number and institution? +> 3. **Keywords**: What keywords should I add for discoverability? +> 4. **Brain regions**: What brain regions were recorded? I'll look up the UBERON terms. +> 5. 
**Any additional contributors** not on the paper (e.g., data curators, technicians)? + +#### Publishing + +> When all metadata is complete and you're ready to make your dataset permanently citable: +> 1. Review the metadata at your Dandiset URL +> 2. Click "Publish" on the Dandiset page +> 3. This creates an immutable version with a DOI +> 4. The DOI can be used in publications to reference this exact version of the data +> +> Note: You can continue uploading files and publish new versions later. Each version +> gets its own DOI. + +### Step 8: Set Asset-Level Metadata (Brain Region per Subject) + +After uploading and setting dandiset-level metadata, set per-asset metadata — particularly +brain region when it varies across subjects or sessions. DANDI assets support an `about` +field (same schema as dandiset-level) that can hold `Anatomy` terms per file. + +#### 8a. Build a Subject → Brain Region Mapping + +Ask the user which brain regions each subject was recorded from. Often this is already +known from Phase 3 metadata collection or from the NWB files themselves: + +> Different subjects may have implants in different brain regions. Can you tell me +> which brain region(s) each subject was recorded from? 
For example: +> - Subject A001: CA1 +> - Subject A002: V1, LM +> - Subject A003: mPFC + +Or extract it programmatically from the NWB files if `electrodes.location` or +`ImagingPlane.location` is set: + +```python +from pynwb import NWBHDF5IO +from pathlib import Path + +subject_regions = {} +for nwb_path in sorted(Path("./000123").rglob("*.nwb")): + with NWBHDF5IO(str(nwb_path), "r") as io: + nwbfile = io.read() + subject_id = nwbfile.subject.subject_id if nwbfile.subject else None + regions = set() + + # From electrodes table + if nwbfile.electrodes and "location" in nwbfile.electrodes.colnames: + for loc in nwbfile.electrodes["location"].data[:]: + if loc and loc != "unknown": + regions.add(loc) + + # From imaging planes + if "ophys" in nwbfile.processing: + for container in nwbfile.processing["ophys"].data_interfaces.values(): + if hasattr(container, "imaging_plane"): + loc = container.imaging_plane.location + if loc and loc != "unknown": + regions.add(loc) + + if subject_id and regions: + subject_regions[subject_id] = list(regions) + +print(subject_regions) +# e.g., {"C005": ["nucleus accumbens"], "C015": ["nucleus accumbens", "ventral tegmental area"]} +``` + +#### 8b. Look Up UBERON Terms + +Use the same `lookup_ontology_term` function from Step 7c to resolve brain region names +to UBERON identifiers. **Use full OBO URIs** (not compact CURIEs like `UBERON:0002421`) +because the DANDI asset schema requires `"format": "uri"` on identifiers. + +Present results to the user for confirmation: + +```python +region_to_uberon = {} +for regions in subject_regions.values(): + for region in regions: + if region not in region_to_uberon: + terms = lookup_ontology_term(region, "uberon") + if terms: + best = terms[0] + region_to_uberon[region] = { + "schemaKey": "Anatomy", + "name": best["label"], + "identifier": best["iri"], # Full OBO URI, e.g., "http://purl.obolibrary.org/obo/UBERON_0012171" + } +``` + +#### 8c. 
Apply Brain Region to Each Asset + +Use the DANDI REST API directly to update each asset's `about` field. The workflow +is: list assets → GET metadata → update `about` → PUT back with `blob_id`. + +**Note**: Each PUT creates a new asset version with a new `asset_id`. + +```python +import requests + +DANDI_API = "https://api.dandiarchive.org/api" # or sandbox +HEADERS = {"Authorization": f"token {api_key}", "Content-Type": "application/json"} +DANDISET_ID = "000123" + +# List all assets +resp = requests.get(f"{DANDI_API}/dandisets/{DANDISET_ID}/versions/draft/assets/", headers=HEADERS) +assets = resp.json()["results"] + +for asset_info in assets: + asset_id = asset_info["asset_id"] + blob_id = asset_info["blob"] + path = asset_info["path"] + + # Extract subject_id from path (e.g., "sub-C005/sub-C005_ses-xxx.nwb") + subject_id = path.split("/")[0].replace("sub-", "") if path.startswith("sub-") else None + if not subject_id or subject_id not in subject_regions: + continue + + # Build anatomy entries for this subject + about = [region_to_uberon[r] for r in subject_regions[subject_id] if r in region_to_uberon] + if not about: + continue + + # GET current asset metadata + meta_resp = requests.get(f"{DANDI_API}/assets/{asset_id}/", headers=HEADERS) + metadata = meta_resp.json() + metadata["about"] = about + + # PUT updated metadata + put_resp = requests.put( + f"{DANDI_API}/dandisets/{DANDISET_ID}/versions/draft/assets/{asset_id}/", + headers=HEADERS, + json={"metadata": metadata, "blob_id": blob_id}, + ) + if put_resp.status_code == 200: + print(f" {path}: {[a['name'] for a in about]}") + else: + print(f" {path}: FAILED {put_resp.status_code} - {put_resp.text[:200]}") +``` + +If the dandiset has many assets, paginate through them: +```python +url = f"{DANDI_API}/dandisets/{DANDISET_ID}/versions/draft/assets/" +while url: + resp = requests.get(url, headers=HEADERS) + data = resp.json() + for asset_info in data["results"]: + # ... 
same update logic as above + pass + url = data.get("next") +``` + +#### 8d. Verify Asset Metadata + +Spot-check a few assets to confirm the metadata was saved: + +```python +resp = requests.get(f"{DANDI_API}/dandisets/{DANDISET_ID}/versions/draft/assets/", headers=HEADERS) +for asset_info in resp.json()["results"][:5]: + meta = requests.get(f"{DANDI_API}/assets/{asset_info['asset_id']}/", headers=HEADERS).json() + about = meta.get("about", []) + print(f" {asset_info['path']}: {[a['name'] for a in about] if about else '(none)'}") +``` + +### Testing with Sandbox + +For testing uploads before going to production: + +```python +# Use the sandbox server +automatic_dandi_upload( + dandiset_id="000123", + nwb_folder_path="./nwb_output", + sandbox=True, # Upload to sandbox.dandiarchive.org +) +``` + +Or with the CLI: +```bash +# Get your sandbox API key from https://sandbox.dandiarchive.org/ +export DANDI_API_KEY=your_sandbox_key + +# Upload to sandbox +dandi upload -i dandi-sandbox +``` + +For programmatic metadata editing on the sandbox, use: +```python +from dandi.dandiapi import DandiAPIClient + +client = DandiAPIClient(api_url="https://api.sandbox.dandiarchive.org/api") +client.dandi_authenticate() +dandiset = client.get_dandiset("000123", "draft") +# ... same metadata operations as production +``` + +The sandbox server is at https://sandbox.dandiarchive.org/ (API: https://api.sandbox.dandiarchive.org/) — +create a separate account and Dandiset there for testing. + +### Step 9: Write Conversion Manifest + +After the upload is complete and metadata is set, write a `conversion_manifest.yaml` to the +conversion repo. This manifest captures structured metadata about what was built, enabling +the weekly registry scan to aggregate it for future conversions. 
+ +Build the manifest from the conversion artifacts you've created throughout the engagement: + +```yaml +# conversion_manifest.yaml (in repo root) +schema_version: 1 +lab: "" +conversions: + - name: "" + status: completed + species: "" + modalities: [ecephys, behavior] # from Phase 1 + neuroconv_interfaces: + - name: SpikeGLXRecordingInterface + file_patterns: ["*.ap.bin", "*.ap.meta"] + - name: SpikeGLXLFPInterface + file_patterns: ["*.lf.bin", "*.lf.meta"] + - name: PhySortingInterface + file_patterns: ["spike_times.npy", "cluster_group.tsv"] + custom_interfaces: + - name: "" + file: "src///interfaces/.py" + handles: "" + creates: [Position, BehavioralEvents] # NWB types created + file_patterns: ["events.csv", "trials.csv"] + extensions: [] # any ndx-* extensions used + sync_approach: "" + dandi_id: "<6-digit dandiset ID>" + pattern: "" + lessons: + - "" + date_completed: "" +``` + +**How to populate each field:** +- `name`: The conversion subdirectory name (e.g., `experiment_2026`) +- `modalities`: Collect from the Data Streams table in `conversion_notes.md` +- `neuroconv_interfaces`: From the Interface Mapping table in `conversion_notes.md`. + Each entry has `name` (the interface class) and `file_patterns` (globs that this + interface handles, from Phase 2 inspection). +- `custom_interfaces`: From any custom DataInterface classes you wrote in Phase 5. + Include `file_patterns` for the files each custom interface reads. 
+- `extensions`: Any `ndx-*` packages used (e.g., `ndx-fiber-photometry`, `ndx-pose`) +- `sync_approach`: From Phase 4 sync plan +- `dandi_id`: The Dandiset ID from this phase +- `lessons`: Anything surprising, non-obvious, or worth knowing for future similar conversions +- `date_completed`: Today's date + +**Commit and push the manifest** (remote was configured in Phase 1 via the API): +```bash +git add conversion_manifest.yaml +git commit -m "Add conversion manifest for registry + +Dandiset: +Modalities: +Interfaces: NeuroConv + custom" +if git remote get-url origin &>/dev/null; then git push; fi +``` + +If the repo is in the `nwb-conversions` org (the normal case when the API is reachable), +the weekly registry scan will find it automatically — no further action needed. + +If working locally (API was unreachable), inform the user: +> The conversion manifest has been saved locally. To include this conversion in the +> registry for future reference, contact CatalystNeuro for assistance. + +### Step 10: Save Conversation History + +Save the Claude Code conversation that produced this conversion into the repo. This +captures every decision, data inspection, question, and code generation step for +full reproducibility. + +```bash +# Find the active Claude Code conversation JSONL (most recently modified) +CONVERSATION=$(ls -t ~/.claude/projects/*/*.jsonl 2>/dev/null | head -1) +if [ -n "$CONVERSATION" ]; then + mkdir -p .claude + cp "$CONVERSATION" .claude/conversation.jsonl + git add .claude/conversation.jsonl + git commit -m "Save Claude Code conversation history" + if git remote get-url origin &>/dev/null; then git push; fi + echo "Saved conversation: $(du -h .claude/conversation.jsonl | cut -f1)" +else + echo "No conversation JSONL found — skipping" +fi +``` + +The conversation file is a JSONL containing the full exchange between the user and Claude +Code, including tool calls, file reads, and data inspection outputs. 
It can be replayed +to understand exactly how the conversion was built. + +### Common Issues + +- **"Unable to find environment variable DANDI_API_KEY"**: Set the API key with `export DANDI_API_KEY=...` +- **Validation errors**: Run `nwbinspector` and `dandi validate` to identify issues +- **Files too large**: DANDI supports files up to 5TB. Contact DANDI team for datasets >10TB +- **Path too long**: DANDI has a 512-character path limit. Shorten session/subject IDs if needed +- **Organize step fails**: Ensure NWB files have `subject.subject_id` and `session_id` set +- **Upload hangs**: Try with `number_of_jobs=1` and `number_of_threads=1` for debugging. + Check logs at `~/Library/Logs/dandi-cli` (macOS) or `~/.cache/dandi-cli/log` (Linux) + +### Add Upload to convert_all_sessions.py + +Optionally add upload as the final step of batch conversion: + +```python +def dataset_to_nwb( + data_dir_path, + output_dir_path, + dandiset_id=None, + max_workers=1, + stub_test=False, +): + # ... run all conversions ... + + if dandiset_id and not stub_test: + from neuroconv.tools.data_transfers import automatic_dandi_upload + automatic_dandi_upload( + dandiset_id=dandiset_id, + nwb_folder_path=output_dir_path, + ) +``` diff --git a/src/pyflask/ai/skill/tools/fetch_paper.py b/src/pyflask/ai/skill/tools/fetch_paper.py new file mode 100644 index 000000000..7f4888867 --- /dev/null +++ b/src/pyflask/ai/skill/tools/fetch_paper.py @@ -0,0 +1,358 @@ +#!/usr/bin/env python3 +"""Fetch full text of a scientific paper and extract specific information. + +Usage: + python fetch_paper.py [--extract
] [--query ] + +Identifier can be: + - DOI (e.g., 10.1038/s41586-019-1234-5) + - PMID (e.g., 31234567) + - PMC ID (e.g., PMC6789012) + - URL from doi.org, pubmed, pmc, or europepmc + +Examples: + python fetch_paper.py 10.1126/science.aav7893 + python fetch_paper.py 10.1126/science.aav7893 --extract methods + python fetch_paper.py PMC6525101 --extract methods + python fetch_paper.py 31000656 --extract abstract +""" + +import argparse +import json +import re +import sys +from urllib.error import HTTPError, URLError +from urllib.parse import quote +from urllib.request import Request, urlopen + + +def parse_identifier(raw: str) -> dict: + """Parse a DOI, PMID, PMC ID, or URL into a normalized identifier.""" + raw = raw.strip() + + # URL patterns + doi_url = re.match(r"https?://(?:dx\.)?doi\.org/(.+)", raw) + if doi_url: + return {"type": "doi", "id": doi_url.group(1)} + + pubmed_url = re.match(r"https?://(?:www\.)?ncbi\.nlm\.nih\.gov/pubmed/(\d+)", raw) + if pubmed_url: + return {"type": "pmid", "id": pubmed_url.group(1)} + + pmc_url = re.match(r"https?://(?:www\.)?ncbi\.nlm\.nih\.gov/pmc/articles/(PMC\d+)", raw) + if not pmc_url: + pmc_url = re.match(r"https?://pmc\.ncbi\.nlm\.nih\.gov/articles/(PMC\d+)", raw) + if pmc_url: + return {"type": "pmc", "id": pmc_url.group(1)} + + europepmc_url = re.match(r"https?://europepmc\.org/article/(\w+)/(\d+)", raw) + if europepmc_url: + return {"type": europepmc_url.group(1).lower(), "id": europepmc_url.group(2)} + + # Raw identifiers + if raw.upper().startswith("PMC"): + return {"type": "pmc", "id": raw.upper()} + if raw.isdigit() and len(raw) >= 7: + return {"type": "pmid", "id": raw} + if "/" in raw: + return {"type": "doi", "id": raw} + + return {"type": "unknown", "id": raw} + + +def fetch_url(url: str, accept: str = "application/json") -> str: + """Fetch a URL and return the response text.""" + req = Request(url, headers={"Accept": accept, "User-Agent": "NWB-GUIDE/1.0"}) + with urlopen(req, timeout=30) as resp: + return 
resp.read().decode("utf-8") + + +def resolve_ids(identifier: dict) -> dict: + """Resolve any identifier to DOI, PMID, and PMC ID using NCBI converter.""" + id_val = identifier["id"] + + if identifier["type"] == "pmc": + id_val = identifier["id"].replace("PMC", "") + query_id = f"PMC{id_val}" + else: + query_id = id_val + + url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?tool=nwbguide&format=json&ids={quote(query_id)}" + try: + data = json.loads(fetch_url(url)) + records = data.get("records", []) + if records and records[0].get("status") != "error": + r = records[0] + return { + "doi": r.get("doi"), + "pmid": str(r["pmid"]) if "pmid" in r else None, + "pmcid": r.get("pmcid"), + } + except Exception: + pass + + # Return what we have + result = {"doi": None, "pmid": None, "pmcid": None} + result[identifier["type"]] = identifier["id"] + return result + + +def fetch_bioc_fulltext(pmcid: str) -> dict | None: + """Fetch full text via NCBI BioC API (best for open access papers). + + Returns parsed sections dict or None. 
+ """ + numeric = pmcid.replace("PMC", "") + url = f"https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/PMC{numeric}/unicode" + try: + data = json.loads(fetch_url(url)) + except Exception: + return None + + sections = {} + documents = data if isinstance(data, list) else [data] + + for doc in documents: + for passage in doc.get("documents", [{}])[0].get("passages", []): + infons = passage.get("infons", {}) + sec_type = infons.get("section_type", "").lower() + text = passage.get("text", "") + + if not text.strip(): + continue + + # Normalize section names + if sec_type in ("title",): + key = "title" + elif sec_type in ("abstract",): + key = "abstract" + elif sec_type in ("intro", "introduction"): + key = "introduction" + elif sec_type in ("methods", "materials", "materials and methods", "experimental"): + key = "methods" + elif sec_type in ("results", "results and discussion"): + key = "results" + elif sec_type in ("discuss", "discussion"): + key = "discussion" + elif sec_type in ("suppl", "supplementary", "supplementary material"): + key = "supplementary" + elif sec_type in ("ack", "acknowledgements", "acknowledgments", "funding"): + key = "acknowledgements" + elif sec_type in ("ref", "references"): + continue # skip references + elif "data" in sec_type and "avail" in sec_type: + key = "data_availability" + elif sec_type in ("fig", "fig_title_caption", "table", "table_title_caption"): + key = "figures_tables" + elif sec_type: + key = sec_type.replace(" ", "_")[:40] + else: + key = "body" + + if key in sections: + sections[key] += "\n" + text + else: + sections[key] = text + + return sections if sections else None + + +def fetch_pubmed_abstract(pmid: str) -> dict | None: + """Fetch abstract from PubMed E-utilities as fallback.""" + import xml.etree.ElementTree as ET + + url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={pmid}&rettype=xml" + try: + xml_text = fetch_url(url, accept="text/xml") + root = 
ET.fromstring(xml_text) + + sections = {} + + # Title + title_el = root.find(".//ArticleTitle") + if title_el is not None and title_el.text: + sections["title"] = title_el.text + + # Abstract + abstract_parts = [] + for abs_el in root.findall(".//AbstractText"): + label = abs_el.get("Label", "") + text = "".join(abs_el.itertext()) + if label: + abstract_parts.append(f"{label}: {text}") + else: + abstract_parts.append(text) + if abstract_parts: + sections["abstract"] = "\n".join(abstract_parts) + + # Keywords + kw = [el.text for el in root.findall(".//Keyword") if el.text] + if kw: + sections["keywords"] = ", ".join(kw) + + # Journal + journal_el = root.find(".//Journal/Title") + if journal_el is not None and journal_el.text: + sections["journal"] = journal_el.text + + return sections if sections else None + except Exception: + return None + + +def fetch_europepmc_abstract(identifier: dict) -> dict | None: + """Search Europe PMC and return article metadata + abstract.""" + id_type = identifier["type"] + id_val = identifier["id"] + + if id_type == "doi": + query = f'DOI:"{id_val}"' + elif id_type == "pmid": + query = f"EXT_ID:{id_val} AND SRC:MED" + elif id_type == "pmc": + query = f"PMCID:{id_val}" + else: + query = id_val + + url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/search?query={quote(query)}&format=json&resultType=core&pageSize=1" + try: + data = json.loads(fetch_url(url)) + results = data.get("resultList", {}).get("result", []) + if not results: + return None + + r = results[0] + sections = {} + if r.get("title"): + sections["title"] = r["title"] + if r.get("abstractText"): + sections["abstract"] = r["abstractText"] + if r.get("journalTitle"): + sections["journal"] = r["journalTitle"] + if r.get("keywordList", {}).get("keyword"): + sections["keywords"] = ", ".join(r["keywordList"]["keyword"]) + + return sections if sections else None + except Exception: + return None + + +def fetch_paper(raw_identifier: str) -> dict: + """Fetch a paper and return 
structured sections. + + Strategy: + 1. Resolve identifier to DOI/PMID/PMCID + 2. Try BioC full text (best for open access PMC papers) + 3. Fall back to PubMed abstract + 4. Fall back to Europe PMC abstract + """ + identifier = parse_identifier(raw_identifier) + ids = resolve_ids(identifier) + + result = { + "identifier": identifier, + "resolved_ids": ids, + "source": None, + "sections": {}, + "has_full_text": False, + "error": None, + } + + # Try BioC full text if we have a PMC ID + if ids.get("pmcid"): + sections = fetch_bioc_fulltext(ids["pmcid"]) + if sections: + result["source"] = "pmc_bioc" + result["sections"] = sections + result["has_full_text"] = True + return result + + # Try PubMed abstract + if ids.get("pmid"): + sections = fetch_pubmed_abstract(ids["pmid"]) + if sections: + result["source"] = "pubmed" + result["sections"] = sections + return result + + # Try Europe PMC + sections = fetch_europepmc_abstract(identifier) + if sections: + result["source"] = "europepmc" + result["sections"] = sections + return result + + result["error"] = f"Could not fetch paper for: {raw_identifier}" + return result + + +def main(): + parser = argparse.ArgumentParser( + description="Fetch scientific paper full text or abstract", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument("identifier", help="DOI, PMID, PMC ID, or URL") + parser.add_argument("--extract", help="Section to extract (e.g., methods, results, abstract, all)") + parser.add_argument("--query", help="Specific question — printed as reminder after the text") + parser.add_argument("--json", action="store_true", help="Output as JSON") + args = parser.parse_args() + + paper = fetch_paper(args.identifier) + + if paper["error"] and not paper["sections"]: + print(f"ERROR: {paper['error']}", file=sys.stderr) + sys.exit(1) + + sections = paper["sections"] + + if args.json: + out = {k: v[:8000] for k, v in sections.items()} + out["_source"] = paper["source"] + 
out["_has_full_text"] = paper["has_full_text"] + out["_resolved_ids"] = paper["resolved_ids"] + if paper["error"]: + out["_warning"] = paper["error"] + print(json.dumps(out, indent=2)) + return + + # Header + print(f"Source: {paper['source']}") + print(f"Full text: {'yes' if paper['has_full_text'] else 'no (abstract only)'}") + ids = paper["resolved_ids"] + id_strs = [f"{k}={v}" for k, v in ids.items() if v] + if id_strs: + print(f"IDs: {', '.join(id_strs)}") + print() + + if args.extract and args.extract.lower() != "all": + key = args.extract.lower().strip() + if key in sections: + print(f"=== {key.upper()} ===") + print(sections[key][:10000]) + if len(sections[key]) > 10000: + print(f"\n... [truncated, {len(sections[key])} chars total]") + else: + print(f"Section '{key}' not found.") + print(f"Available sections: {', '.join(sections.keys())}") + if "abstract" in sections: + print(f"\n=== ABSTRACT (fallback) ===") + print(sections["abstract"]) + else: + for key, text in sections.items(): + print(f"=== {key.upper()} ===") + limit = 10000 if args.extract == "all" else 3000 + print(text[:limit]) + if len(text) > limit: + print(f"... [truncated, {len(text)} chars total]") + print() + + if args.query: + print(f"\n{'='*60}") + print(f"QUERY: {args.query}") + print(f"{'='*60}") + print("(Review the text above to answer this question)") + + +if __name__ == "__main__": + main() diff --git a/src/pyflask/ai/skill_loader.py b/src/pyflask/ai/skill_loader.py new file mode 100644 index 000000000..02cbc1078 --- /dev/null +++ b/src/pyflask/ai/skill_loader.py @@ -0,0 +1,48 @@ +"""Load and expand the nwb-convert skill into a system prompt. + +Reads SKILL.md and expands `$file:` directives that include phase-specific +instructions and knowledge files. +""" + +import re +from pathlib import Path + + +def load_skill(skill_dir=None): + """Load SKILL.md and expand $file: includes, return full system prompt. 
+ + Parameters + ---------- + skill_dir : str or Path, optional + Path to the skill directory containing SKILL.md. + Defaults to the bundled skill/ directory next to this file. + + Returns + ------- + str + The fully expanded system prompt text. + """ + if skill_dir is None: + skill_dir = Path(__file__).parent / "skill" + + skill_dir = Path(skill_dir) + skill_md = (skill_dir / "SKILL.md").read_text() + + # Strip YAML frontmatter (between --- markers) + if skill_md.startswith("---"): + parts = skill_md.split("---", 2) + if len(parts) >= 3: + skill_md = parts[2] + + # Expand $file: directives — these reference relative paths from the skill dir + def expand(match): + rel_path = match.group(1).strip() + file_path = skill_dir / rel_path + if file_path.exists(): + return file_path.read_text() + else: + return f"[WARNING: File not found: {rel_path}]" + + expanded = re.sub(r"^\$file:\s*(.+)$", expand, skill_md, flags=re.MULTILINE) + + return expanded.strip() diff --git a/src/pyflask/app.py b/src/pyflask/app.py index 00de7c4da..d1ac50320 100644 --- a/src/pyflask/app.py +++ b/src/pyflask/app.py @@ -27,6 +27,7 @@ resource_path, ) from namespaces import ( # neurosift_namespace, + ai_namespace, dandi_namespace, data_namespace, neuroconv_namespace, @@ -64,6 +65,7 @@ api.add_namespace(data_namespace) api.add_namespace(system_namespace) api.add_namespace(dandi_namespace) +api.add_namespace(ai_namespace) # api.add_namespace(neurosift_namespace) # TODO: enable later api.init_app(flask_app) diff --git a/src/pyflask/namespaces/__init__.py b/src/pyflask/namespaces/__init__.py index 0f1edb274..7ad227d45 100644 --- a/src/pyflask/namespaces/__init__.py +++ b/src/pyflask/namespaces/__init__.py @@ -1,3 +1,4 @@ +from .ai_assistant import ai_namespace from .dandi import dandi_namespace from .data import data_namespace from .neuroconv import neuroconv_namespace diff --git a/src/pyflask/namespaces/ai_assistant.py b/src/pyflask/namespaces/ai_assistant.py new file mode 100644 index 
000000000..d0eb1832a --- /dev/null +++ b/src/pyflask/namespaces/ai_assistant.py @@ -0,0 +1,239 @@ +"""Flask-RESTX namespace for the AI conversion assistant. + +Provides endpoints to create agent sessions, send messages, and stream +responses via Server-Sent Events (SSE). +""" + +import json +import os +import time +from pathlib import Path + +from ai.agent import create_session, get_session, remove_session +from ai.session_store import ( + CONVERSIONS_DIR, + SESSIONS_DIR, + delete_session_record, + get_session_history, +) +from ai.session_store import list_sessions as list_saved_sessions +from flask import Response, request +from flask_restx import Namespace, Resource + +ai_namespace = Namespace("ai", description="AI conversion assistant") + + +@ai_namespace.route("/sessions") +class Sessions(Resource): + @ai_namespace.doc( + responses={200: "Success"}, + description="List all saved AI sessions.", + ) + def get(self): + """List all saved sessions (most recent first).""" + return {"sessions": list_saved_sessions()} + + @ai_namespace.doc( + responses={200: "Success", 400: "Bad Request", 500: "Internal server error"}, + description="Create a new AI agent session for NWB conversion.", + ) + def post(self): + """Create a new agent session. + + Payload: + data_dirs (list[str]): Paths to data directories to convert. + data_dir (str): Legacy single path (used if data_dirs not provided). + api_key (str, optional): Anthropic API key. + model (str, optional): Model to use. + lab_name (str, optional): Lab name for monitoring. 
+ """ + payload = ai_namespace.payload or {} + + # Support both data_dirs (list) and legacy data_dir (string) + data_dirs = payload.get("data_dirs") or [] + if not data_dirs: + single = payload.get("data_dir") + if single: + data_dirs = [single] + + if not data_dirs: + return {"message": "At least one data directory is required"}, 400 + + for d in data_dirs: + if not os.path.isdir(d): + return {"message": f"data_dir does not exist: {d}"}, 400 + + # Derive a label from the first data directory name + label = Path(data_dirs[0]).name.replace(" ", "-").lower() + repo_name = f"{label}-to-nwb" + + # Use datetime as session ID (filesystem-safe, sortable) + from datetime import datetime, timezone + + session_id = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") + + # Code repo lives at [NWB_GUIDE_DIR]/ai-sessions//