mirror of
https://github.com/tiennm99/goclaw.git
synced 2026-06-11 14:11:29 +00:00
ace07509b7
* feat(infra): add runtime package support for skills Install nodejs, npm, pandoc, github-cli + pre-install Python packages (openpyxl, pandas, python-pptx, markitdown) and Node packages (docx, pptxgenjs). Configure runtime dirs for agent pip/npm installs with PIP_TARGET, NPM_CONFIG_PREFIX, NODE_PATH to enable dynamic package installation in read-only container environment. * feat(infra): add bundled skills with runtime package support - Add 5 bundled skills: docx, pdf, pptx, xlsx, skill-creator from container skills-store - Wire GOCLAW_BUILTIN_SKILLS_DIR env var in gateway and CLI - Support optional runtime packages alongside dynamic skill loading - Update Dockerfile to COPY bundled-skills at /app/bundled-skills/ - Add PIP_CACHE_DIR in docker-entrypoint.sh for clean pip installs - Document bundled skills in 14-skills-runtime.md section 6 * feat(infra): remove ai-multimodal skill directory from bundled skills Remove the ai-multimodal skill package as part of consolidating runtime package support for bundled skills. This directory is no longer needed in the bundled skills structure. * feat(ci): add semantic release and Docker Hub publishing Add go-semantic-release workflow to auto-create semver tags on merge to main. Extend docker-publish to push all variants to both GHCR and Docker Hub (digitop/goclaw). 
* feat(skills): add system skills infrastructure with is_system column, dep scanning, and seeder - Migration 000017: add is_system boolean column with partial index - Store layer: UpsertSystemSkill, delete protection, IsSystemSkill - ListAccessible auto-includes system skills (no grants needed) - ListWithGrantStatus returns is_system field - Dependency scanner: auto-detect deps from scripts/ or skill-manifest.json - Dependency checker: verify system binaries, Python/Node packages - Seeder: seed bundled skills into DB on startup (idempotent via hash) - Gateway wiring: GOCLAW_BUNDLED_SKILLS_DIR env for bundled skills - HTTP: delete guard (403), slug conflict check (409), rescan-deps endpoint - UI: System badge, hide delete for system skills, rescan deps button - Agent skills tab: "Always available" for system skills - i18n: en/vi/zh keys for system skills, deps scanning * feat(skills): conditional system prompt, skill manifests, and Zip Slip fix - System prompt: only show package list when python3/node are available - Add skill-manifest.json for pdf, docx, xlsx, pptx bundled skills - Fix Zip Slip vulnerability in office/unpack.py (all 3 copies) * refactor(skills): extract shared office code to _shared/ and deduplicate Move office scripts (pack, unpack, validate, schemas, validators) from duplicated copies in docx/xlsx/pptx to skills/_shared/office/ with symlinks. Remove soffice.py (non-functional in containers) and update SKILL.md references to use soffice binary directly. Update seeder copyDir to follow symlinks. Removes ~45K lines of duplicate code across 3 skills. 
* fix(skills): address code review findings for system skills integration - H1: Remove dead symlink branch in copyDir (filepath.Walk follows symlinks) - H3: Fix rescan-deps to query ALL skills (including archived) and re-activate when deps become available; add ListAllSkills() + Status field to SkillInfo - H4: Add Status field to SkillCreateParams, stop overloading Visibility - M1: Batch Python/Node dep checks into single subprocess per runtime - M4: Add rows.Err() check in ListSkills to prevent caching partial results * feat(skills): async dep checking with realtime WS events Split Seed() into sync DB upsert + async CheckDepsAsync() goroutine. Gateway startup no longer blocks on Python/Node subprocess dep checks. - Seed() returns seeded skills list, all initially status="active" - CheckDepsAsync() runs in background, emits skill.deps.checked per-skill - skill.deps.complete event emitted when all checks finish - Each failed dep check: archives skill + BumpVersion() for immediate cache invalidation so next agent turn picks up the change - UI: use-query-invalidation listens to skill.deps.* events → auto-refresh skills list in realtime * feat(skills): system skills integration with toggle, dep checking, and per-item install - Add is_system, deps, enabled columns to skills table (migration 017) - Seed bundled core skills (pdf, docx, pptx, xlsx, skill-creator) on startup - PYTHONPATH-based dep detection — eliminates false positives from local modules - Per-item dep install UI with individual status (installing/success/error) - Enable/disable toggle for core and custom skills (independent of dep status) - Re-run dep check when skill is toggled back on - Inline skill thresholds: 40 skills / 5000 tokens before switching to search mode - Fix UpsertSystemSkill: backfill null file_hash without bumping DB version - Remove redundant skill-manifest.json files (replaced by deps JSONB column) - Show author from frontmatter in custom skills tab - Runtime checker for 
python3/pip3/node/npm availability - WS events for dep checking/installing progress - docs: add 15-core-skills-system.md, 16-skill-publishing.md --------- Co-authored-by: Goon <duy@wearetopgroup.com>
248 lines
8.7 KiB
Python
248 lines
8.7 KiB
Python
"""
|
|
Validator for tracked changes in Word documents.
|
|
"""
|
|
|
|
import subprocess
|
|
import tempfile
|
|
import zipfile
|
|
from pathlib import Path
|
|
|
|
|
|
class RedliningValidator:
    """Check that one author's edits to a Word document are all tracked.

    The validator compares ``word/document.xml`` of an unpacked (modified)
    document with the same part inside the original ``.docx``. In both trees
    the author's ``<w:ins>`` elements are dropped and the author's ``<w:del>``
    elements are unwrapped; if the remaining paragraph text differs, the
    document is reported as failing.
    """

    def __init__(self, unpacked_dir, original_docx, verbose=False, author="Claude"):
        """Store the paths and options used by :meth:`validate`.

        Args:
            unpacked_dir: Directory holding the unpacked, modified document
                (``word/document.xml`` is looked up beneath it).
            original_docx: Path to the original ``.docx`` archive.
            verbose: When true, ``PASSED`` messages are printed as well.
            author: Value of ``w:author`` whose tracked changes are checked.
        """
        self.unpacked_dir = Path(unpacked_dir)
        self.original_docx = Path(original_docx)
        self.verbose = verbose
        self.author = author
        self.namespaces = {
            "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
        }

    def repair(self) -> int:
        """Apply no repairs; always returns ``0``."""
        return 0

    def validate(self):
        """Return ``True`` when the author's changes are properly tracked.

        Prints a ``FAILED - ...`` message and returns ``False`` when a
        required file is missing, the original archive cannot be unpacked,
        either XML part cannot be parsed, or the texts differ once the
        author's tracked changes have been reverted. Returns ``True`` early
        when the modified document holds no ``<w:ins>``/``<w:del>`` by the
        author at all.
        """
        import xml.etree.ElementTree as ET

        modified_file = self.unpacked_dir / "word" / "document.xml"
        if not modified_file.exists():
            print(f"FAILED - Modified document.xml not found at {modified_file}")
            return False

        # Pre-check: nothing to validate if the author left no tracked changes.
        # A parse/read problem here is not reported yet; the full comparison
        # below retries the parse and reports it in its usual place.
        try:
            modified_root = ET.parse(modified_file).getroot()
        except (ET.ParseError, OSError):
            modified_root = None

        if modified_root is not None and not self._has_author_changes(modified_root):
            if self.verbose:
                print(f"PASSED - No tracked changes by {self.author} found.")
            return True

        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            try:
                with zipfile.ZipFile(self.original_docx, "r") as zip_ref:
                    zip_ref.extractall(temp_path)
            except Exception as e:
                # Reporting boundary: any unpack failure becomes a FAILED result.
                print(f"FAILED - Error unpacking original docx: {e}")
                return False

            original_file = temp_path / "word" / "document.xml"
            if not original_file.exists():
                print(
                    f"FAILED - Original document.xml not found in {self.original_docx}"
                )
                return False

            try:
                # Reuse the tree from the pre-check instead of parsing twice.
                if modified_root is None:
                    modified_root = ET.parse(modified_file).getroot()
                original_root = ET.parse(original_file).getroot()
            except ET.ParseError as e:
                print(f"FAILED - Error parsing XML files: {e}")
                return False

        self._remove_author_tracked_changes(original_root)
        self._remove_author_tracked_changes(modified_root)

        modified_text = self._extract_text_content(modified_root)
        original_text = self._extract_text_content(original_root)

        if modified_text != original_text:
            print(self._generate_detailed_diff(original_text, modified_text))
            return False

        if self.verbose:
            print(f"PASSED - All changes by {self.author} are properly tracked")
        return True

    def _has_author_changes(self, root):
        """Return whether any ``<w:ins>``/``<w:del>`` under *root* is by the author."""
        author_attr = f"{{{self.namespaces['w']}}}author"
        return any(
            elem.get(author_attr) == self.author
            for path in (".//w:del", ".//w:ins")
            for elem in root.findall(path, self.namespaces)
        )

    def _generate_detailed_diff(self, original_text, modified_text):
        """Build the multi-line failure message, including a word diff if available."""
        error_parts = [
            f"FAILED - Document text doesn't match after removing {self.author}'s tracked changes",
            "",
            "Likely causes:",
            "  1. Modified text inside another author's <w:ins> or <w:del> tags",
            "  2. Made edits without proper tracked changes",
            "  3. Didn't nest <w:del> inside <w:ins> when deleting another's insertion",
            "",
            "For pre-redlined documents, use correct patterns:",
            "  - To reject another's INSERTION: Nest <w:del> inside their <w:ins>",
            "  - To restore another's DELETION: Add new <w:ins> AFTER their <w:del>",
            "",
        ]

        git_diff = self._get_git_word_diff(original_text, modified_text)
        if git_diff:
            error_parts.extend(["Differences:", "============", git_diff])
        else:
            error_parts.append("Unable to generate word diff (git not available)")

        return "\n".join(error_parts)

    def _get_git_word_diff(self, original_text, modified_text):
        """Return a ``git diff --word-diff`` rendering of the two texts, or ``None``.

        A character-level diff (``--word-diff-regex=.``) is tried first; if it
        yields no hunk content, the default word-level diff is used. ``None``
        is returned when neither produces content or when git or the
        temporary files cannot be used.
        """
        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_path = Path(temp_dir)
                original_file = temp_path / "original.txt"
                modified_file = temp_path / "modified.txt"
                original_file.write_text(original_text, encoding="utf-8")
                modified_file.write_text(modified_text, encoding="utf-8")

                char_level = self._run_git_word_diff(
                    original_file, modified_file, ["--word-diff-regex=."]
                )
                if char_level:
                    return char_level

                return self._run_git_word_diff(original_file, modified_file, []) or None
        except (OSError, subprocess.SubprocessError, UnicodeError):
            # Best effort only: a missing git binary or an I/O/encoding problem
            # just means the failure message is printed without a diff.
            return None

    @staticmethod
    def _run_git_word_diff(original_file, modified_file, extra_args):
        """Run one ``git diff --no-index`` and return its hunk content lines.

        Returns ``None`` when git prints nothing, otherwise the non-blank
        lines that follow the ``@@`` hunk headers joined by newlines (this
        may be an empty string).
        """
        result = subprocess.run(
            [
                "git",
                "diff",
                "--word-diff=plain",
                *extra_args,
                "-U0",
                "--no-index",
                str(original_file),
                str(modified_file),
            ],
            capture_output=True,
            text=True,
        )
        if not result.stdout.strip():
            return None

        content_lines = []
        in_content = False
        for line in result.stdout.split("\n"):
            if line.startswith("@@"):
                # Everything before the first hunk header is file metadata.
                in_content = True
                continue
            if in_content and line.strip():
                content_lines.append(line)
        return "\n".join(content_lines)

    def _remove_author_tracked_changes(self, root):
        """Revert the author's tracked changes in *root*, in place.

        The author's ``<w:ins>`` elements are removed together with their
        content. The author's ``<w:del>`` elements are replaced by their own
        children, with every ``<w:delText>`` inside renamed to ``<w:t>`` so
        the deleted text counts as ordinary text again.
        """
        w_ns = self.namespaces["w"]
        ins_tag = f"{{{w_ns}}}ins"
        del_tag = f"{{{w_ns}}}del"
        author_attr = f"{{{w_ns}}}author"
        deltext_tag = f"{{{w_ns}}}delText"
        t_tag = f"{{{w_ns}}}t"

        for parent in root.iter():
            # Collect first, then remove, so the child list is not mutated
            # while it is being iterated.
            author_insertions = [
                child
                for child in parent
                if child.tag == ins_tag and child.get(author_attr) == self.author
            ]
            for ins_elem in author_insertions:
                parent.remove(ins_elem)

        for parent in root.iter():
            author_deletions = [
                (index, child)
                for index, child in enumerate(parent)
                if child.tag == del_tag and child.get(author_attr) == self.author
            ]

            # Work from the last match backwards so the recorded indices of
            # earlier matches stay valid while children are spliced in.
            for del_index, del_elem in reversed(author_deletions):
                for elem in del_elem.iter():
                    if elem.tag == deltext_tag:
                        elem.tag = t_tag

                # Inserting in reverse at a fixed index keeps the original order.
                for child in reversed(list(del_elem)):
                    parent.insert(del_index, child)
                parent.remove(del_elem)

    def _extract_text_content(self, root):
        """Return the text of all non-empty paragraphs, one paragraph per line.

        Only the text of ``<w:t>`` elements found beneath each ``<w:p>`` is
        collected; paragraphs whose concatenated text is empty are skipped.
        """
        p_tag = f"{{{self.namespaces['w']}}}p"
        t_tag = f"{{{self.namespaces['w']}}}t"

        paragraphs = []
        for p_elem in root.findall(f".//{p_tag}"):
            paragraph_text = "".join(
                t_elem.text for t_elem in p_elem.findall(f".//{t_tag}") if t_elem.text
            )
            if paragraph_text:
                paragraphs.append(paragraph_text)

        return "\n".join(paragraphs)
|
|
|
|
|
|
# Import-only module: refuse to run as a script instead of silently doing nothing.
if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")
|