mirror of
https://github.com/tiennm99/goclaw.git
synced 2026-06-10 18:11:00 +00:00
ace07509b7
* feat(infra): add runtime package support for skills Install nodejs, npm, pandoc, github-cli + pre-install Python packages (openpyxl, pandas, python-pptx, markitdown) and Node packages (docx, pptxgenjs). Configure runtime dirs for agent pip/npm installs with PIP_TARGET, NPM_CONFIG_PREFIX, NODE_PATH to enable dynamic package installation in read-only container environment. * feat(infra): add bundled skills with runtime package support - Add 5 bundled skills: docx, pdf, pptx, xlsx, skill-creator from container skills-store - Wire GOCLAW_BUILTIN_SKILLS_DIR env var in gateway and CLI - Support optional runtime packages alongside dynamic skill loading - Update Dockerfile to COPY bundled-skills at /app/bundled-skills/ - Add PIP_CACHE_DIR in docker-entrypoint.sh for clean pip installs - Document bundled skills in 14-skills-runtime.md section 6 * feat(infra): remove ai-multimodal skill directory from bundled skills Remove the ai-multimodal skill package as part of consolidating runtime package support for bundled skills. This directory is no longer needed in the bundled skills structure. * feat(ci): add semantic release and Docker Hub publishing Add go-semantic-release workflow to auto-create semver tags on merge to main. Extend docker-publish to push all variants to both GHCR and Docker Hub (digitop/goclaw). 
* feat(skills): add system skills infrastructure with is_system column, dep scanning, and seeder - Migration 000017: add is_system boolean column with partial index - Store layer: UpsertSystemSkill, delete protection, IsSystemSkill - ListAccessible auto-includes system skills (no grants needed) - ListWithGrantStatus returns is_system field - Dependency scanner: auto-detect deps from scripts/ or skill-manifest.json - Dependency checker: verify system binaries, Python/Node packages - Seeder: seed bundled skills into DB on startup (idempotent via hash) - Gateway wiring: GOCLAW_BUNDLED_SKILLS_DIR env for bundled skills - HTTP: delete guard (403), slug conflict check (409), rescan-deps endpoint - UI: System badge, hide delete for system skills, rescan deps button - Agent skills tab: "Always available" for system skills - i18n: en/vi/zh keys for system skills, deps scanning * feat(skills): conditional system prompt, skill manifests, and Zip Slip fix - System prompt: only show package list when python3/node are available - Add skill-manifest.json for pdf, docx, xlsx, pptx bundled skills - Fix Zip Slip vulnerability in office/unpack.py (all 3 copies) * refactor(skills): extract shared office code to _shared/ and deduplicate Move office scripts (pack, unpack, validate, schemas, validators) from duplicated copies in docx/xlsx/pptx to skills/_shared/office/ with symlinks. Remove soffice.py (non-functional in containers) and update SKILL.md references to use soffice binary directly. Update seeder copyDir to follow symlinks. Removes ~45K lines of duplicate code across 3 skills. 
* fix(skills): address code review findings for system skills integration - H1: Remove dead symlink branch in copyDir (filepath.Walk follows symlinks) - H3: Fix rescan-deps to query ALL skills (including archived) and re-activate when deps become available; add ListAllSkills() + Status field to SkillInfo - H4: Add Status field to SkillCreateParams, stop overloading Visibility - M1: Batch Python/Node dep checks into single subprocess per runtime - M4: Add rows.Err() check in ListSkills to prevent caching partial results * feat(skills): async dep checking with realtime WS events Split Seed() into sync DB upsert + async CheckDepsAsync() goroutine. Gateway startup no longer blocks on Python/Node subprocess dep checks. - Seed() returns seeded skills list, all initially status="active" - CheckDepsAsync() runs in background, emits skill.deps.checked per-skill - skill.deps.complete event emitted when all checks finish - Each failed dep check: archives skill + BumpVersion() for immediate cache invalidation so next agent turn picks up the change - UI: use-query-invalidation listens to skill.deps.* events → auto-refresh skills list in realtime * feat(skills): system skills integration with toggle, dep checking, and per-item install - Add is_system, deps, enabled columns to skills table (migration 017) - Seed bundled core skills (pdf, docx, pptx, xlsx, skill-creator) on startup - PYTHONPATH-based dep detection — eliminates false positives from local modules - Per-item dep install UI with individual status (installing/success/error) - Enable/disable toggle for core and custom skills (independent of dep status) - Re-run dep check when skill is toggled back on - Inline skill thresholds: 40 skills / 5000 tokens before switching to search mode - Fix UpsertSystemSkill: backfill null file_hash without bumping DB version - Remove redundant skill-manifest.json files (replaced by deps JSONB column) - Show author from frontmatter in custom skills tab - Runtime checker for 
python3/pip3/node/npm availability - WS events for dep checking/installing progress - docs: add 15-core-skills-system.md, 16-skill-publishing.md --------- Co-authored-by: Goon <duy@wearetopgroup.com>
287 lines
9.4 KiB
Python
287 lines
9.4 KiB
Python
"""Remove unreferenced files from an unpacked PPTX directory.
|
|
|
|
Usage: python clean.py <unpacked_dir>
|
|
|
|
Example:
|
|
python clean.py unpacked/
|
|
|
|
This script removes:
|
|
- Orphaned slides (not in sldIdLst) and their relationships
|
|
- [trash] directory (unreferenced files)
|
|
- Orphaned .rels files for deleted resources
|
|
- Unreferenced media, embeddings, charts, diagrams, drawings, ink files
|
|
- Unreferenced theme files
|
|
- Unreferenced notes slides
|
|
- Content-Type overrides for deleted files
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import defusedxml.minidom
|
|
|
|
|
|
import re
|
|
|
|
|
|
def get_slides_in_sldidlst(unpacked_dir: Path) -> set[str]:
    """Return the slide file names that the presentation's slide list points at.

    The relationship ids found on ``<p:sldId>`` elements of
    ``ppt/presentation.xml`` are mapped to slide file names through
    ``ppt/_rels/presentation.xml.rels``.

    Args:
        unpacked_dir: Root directory of the unpacked PPTX package.

    Returns:
        Bare slide file names (the relationship target with ``slides/``
        removed). An empty set when either of the two files is missing.
    """
    ppt_dir = unpacked_dir / "ppt"
    presentation_xml = ppt_dir / "presentation.xml"
    presentation_rels = ppt_dir / "_rels" / "presentation.xml.rels"

    if not (presentation_xml.exists() and presentation_rels.exists()):
        return set()

    relationships = defusedxml.minidom.parse(
        str(presentation_rels)
    ).getElementsByTagName("Relationship")

    # Relationship id -> slide file name, restricted to slide-typed
    # relationships whose target lives under "slides/".
    slide_by_rid = {
        node.getAttribute("Id"): node.getAttribute("Target").replace("slides/", "")
        for node in relationships
        if "slide" in node.getAttribute("Type")
        and node.getAttribute("Target").startswith("slides/")
    }

    # The slide list is read with a regex over the raw XML text rather than
    # through the DOM.
    listed_rids = set(
        re.findall(
            r'<p:sldId[^>]*r:id="([^"]+)"',
            presentation_xml.read_text(encoding="utf-8"),
        )
    )

    return {name for rid, name in slide_by_rid.items() if rid in listed_rids}
|
|
|
|
|
|
def remove_orphaned_slides(unpacked_dir: Path) -> list[str]:
    """Delete slide parts that the presentation's slide list does not reference.

    For every ``ppt/slides/slide*.xml`` that is not named by
    ``get_slides_in_sldidlst`` the slide file and its ``_rels`` companion are
    deleted, and the matching ``Relationship`` entries are removed from
    ``ppt/_rels/presentation.xml.rels``.

    Nothing is removed when ``ppt/presentation.xml`` or its ``.rels`` file is
    missing: in that situation the slide list cannot be read, and treating the
    resulting empty set as "no slide is referenced" would delete every slide.

    Args:
        unpacked_dir: Root directory of the unpacked PPTX package.

    Returns:
        Paths of the deleted files, relative to ``unpacked_dir``.
    """
    ppt_dir = unpacked_dir / "ppt"
    slides_dir = ppt_dir / "slides"
    slides_rels_dir = slides_dir / "_rels"
    pres_path = ppt_dir / "presentation.xml"
    pres_rels_path = ppt_dir / "_rels" / "presentation.xml.rels"

    if not slides_dir.exists():
        return []

    # Guard against wiping all slides of a package whose manifest is absent.
    if not pres_path.exists() or not pres_rels_path.exists():
        return []

    referenced_slides = get_slides_in_sldidlst(unpacked_dir)
    removed = []

    # sorted() gives a stable report order and snapshots the listing before
    # any file is deleted.
    for slide_file in sorted(slides_dir.glob("slide*.xml")):
        if slide_file.name in referenced_slides:
            continue

        rel_path = slide_file.relative_to(unpacked_dir)
        slide_file.unlink()
        removed.append(str(rel_path))

        rels_file = slides_rels_dir / f"{slide_file.name}.rels"
        if rels_file.exists():
            rels_file.unlink()
            removed.append(str(rels_file.relative_to(unpacked_dir)))

    if not removed:
        return removed

    # Drop the presentation-level relationships that pointed at removed slides.
    rels_dom = defusedxml.minidom.parse(str(pres_rels_path))
    changed = False

    for rel in list(rels_dom.getElementsByTagName("Relationship")):
        target = rel.getAttribute("Target")
        if not target.startswith("slides/"):
            continue
        if target.replace("slides/", "") in referenced_slides:
            continue
        if rel.parentNode:
            rel.parentNode.removeChild(rel)
            changed = True

    if changed:
        with open(pres_rels_path, "wb") as f:
            f.write(rels_dom.toxml(encoding="utf-8"))

    return removed
|
|
|
|
|
|
def remove_trash_directory(unpacked_dir: Path) -> list[str]:
    """Delete the ``[trash]`` directory of an unpacked package, with its contents.

    The directory is emptied recursively, so nested sub-directories no longer
    make the final ``rmdir`` fail. Symbolic links are unlinked, never followed.

    Args:
        unpacked_dir: Root directory of the unpacked PPTX package.

    Returns:
        Paths of the deleted non-directory entries, relative to
        ``unpacked_dir``. Empty when there is no ``[trash]`` directory.
    """
    trash_dir = unpacked_dir / "[trash]"
    removed = []

    if not trash_dir.is_dir():
        return removed

    def purge(directory: Path) -> None:
        # Empty the directory depth-first so it can be removed afterwards.
        for entry in sorted(directory.iterdir()):
            if entry.is_dir() and not entry.is_symlink():
                purge(entry)
            else:
                removed.append(str(entry.relative_to(unpacked_dir)))
                entry.unlink()
        directory.rmdir()

    purge(trash_dir)
    return removed
|
|
|
|
|
|
def get_slide_referenced_files(unpacked_dir: Path) -> set:
    """Collect the package files targeted by slide relationship files.

    Every ``Relationship`` in ``ppt/slides/_rels/*.rels`` is resolved to a
    path relative to the package root.

    Args:
        unpacked_dir: Root directory of the unpacked PPTX package.

    Returns:
        Set of ``Path`` objects relative to ``unpacked_dir``. Empty when the
        slides ``_rels`` directory does not exist.
    """
    referenced = set()
    slides_rels_dir = unpacked_dir / "ppt" / "slides" / "_rels"

    if not slides_rels_dir.exists():
        return referenced

    package_root = unpacked_dir.resolve()

    for rels_file in slides_rels_dir.glob("*.rels"):
        # Relative targets are resolved against the directory of the part
        # that owns the .rels file, i.e. the parent of "_rels".
        source_dir = rels_file.parent.parent
        dom = defusedxml.minidom.parse(str(rels_file))
        for rel in dom.getElementsByTagName("Relationship"):
            target = rel.getAttribute("Target")
            if not target:
                continue
            # External targets (hyperlinks, linked media) are not package
            # parts, so they never keep a file alive.
            if rel.getAttribute("TargetMode") == "External":
                continue
            if target.startswith("/"):
                # A leading "/" means "relative to the package root"; joining
                # it as-is would produce a filesystem-absolute path.
                target_path = package_root / target.lstrip("/")
            else:
                target_path = source_dir / target
            try:
                referenced.add(target_path.resolve().relative_to(package_root))
            except ValueError:
                # The target escapes the package directory; ignore it.
                continue

    return referenced
|
|
|
|
|
|
def remove_orphaned_rels_files(unpacked_dir: Path) -> list[str]:
    """Delete chart/diagram/drawing ``.rels`` files that no longer have an owner.

    A ``.rels`` file under ``ppt/charts``, ``ppt/diagrams`` or ``ppt/drawings``
    is deleted when the resource it belongs to is missing on disk, or when
    that resource is not among the files referenced by any slide.

    Args:
        unpacked_dir: Root directory of the unpacked PPTX package.

    Returns:
        Paths of the deleted ``.rels`` files, relative to ``unpacked_dir``.
    """
    removed = []
    slide_referenced = get_slide_referenced_files(unpacked_dir)

    for dir_name in ("charts", "diagrams", "drawings"):
        rels_dir = unpacked_dir / "ppt" / dir_name / "_rels"
        if rels_dir.exists():
            for rels_file in rels_dir.glob("*.rels"):
                # "chart1.xml.rels" belongs to "chart1.xml" one level up.
                owner = rels_dir.parent / rels_file.name.replace(".rels", "")
                try:
                    owner_key = owner.resolve().relative_to(unpacked_dir.resolve())
                except ValueError:
                    # Owner resolves outside the package; leave it alone.
                    continue

                still_needed = owner.exists() and owner_key in slide_referenced
                if still_needed:
                    continue

                rels_file.unlink()
                removed.append(str(rels_file.relative_to(unpacked_dir)))

    return removed
|
|
|
|
|
|
def get_referenced_files(unpacked_dir: Path) -> set:
    """Collect every package file targeted by any ``.rels`` file in the package.

    All ``*.rels`` files below ``unpacked_dir`` are parsed and each
    ``Relationship`` target is resolved to a path relative to the package
    root.

    Args:
        unpacked_dir: Root directory of the unpacked PPTX package.

    Returns:
        Set of ``Path`` objects relative to ``unpacked_dir``.
    """
    referenced = set()
    package_root = unpacked_dir.resolve()

    for rels_file in unpacked_dir.rglob("*.rels"):
        # Relative targets are resolved against the directory of the part
        # that owns the .rels file, i.e. the parent of "_rels".
        source_dir = rels_file.parent.parent
        dom = defusedxml.minidom.parse(str(rels_file))
        for rel in dom.getElementsByTagName("Relationship"):
            target = rel.getAttribute("Target")
            if not target:
                continue
            # External targets (hyperlinks, linked media) are not package
            # parts, so they never keep a file alive.
            if rel.getAttribute("TargetMode") == "External":
                continue
            if target.startswith("/"):
                # A leading "/" means "relative to the package root"; joining
                # it as-is would produce a filesystem-absolute path.
                target_path = package_root / target.lstrip("/")
            else:
                target_path = source_dir / target
            try:
                referenced.add(target_path.resolve().relative_to(package_root))
            except ValueError:
                # The target escapes the package directory; ignore it.
                continue

    return referenced
|
|
|
|
|
|
def remove_orphaned_files(unpacked_dir: Path, referenced: set) -> list[str]:
    """Delete resource, theme and notes files that nothing references.

    Handled in this order:
    1. files directly inside the ``ppt`` resource directories (media,
       embeddings, charts, diagrams, tags, drawings, ink);
    2. ``ppt/theme/theme*.xml`` and, for each deleted theme, its ``.rels``;
    3. ``ppt/notesSlides/*.xml``;
    4. ``ppt/notesSlides/_rels/*.rels`` whose notes slide no longer exists.

    Args:
        unpacked_dir: Root directory of the unpacked PPTX package.
        referenced: Paths relative to ``unpacked_dir`` that must be kept.

    Returns:
        Paths of the deleted files, relative to ``unpacked_dir``.
    """
    removed = []
    ppt_dir = unpacked_dir / "ppt"

    def is_orphan(path: Path) -> bool:
        return path.relative_to(unpacked_dir) not in referenced

    def discard(path: Path) -> None:
        path.unlink()
        removed.append(str(path.relative_to(unpacked_dir)))

    for dir_name in ("media", "embeddings", "charts", "diagrams", "tags", "drawings", "ink"):
        resource_dir = ppt_dir / dir_name
        if resource_dir.exists():
            for candidate in resource_dir.glob("*"):
                if candidate.is_file() and is_orphan(candidate):
                    discard(candidate)

    theme_dir = ppt_dir / "theme"
    if theme_dir.exists():
        for theme_file in theme_dir.glob("theme*.xml"):
            if not is_orphan(theme_file):
                continue
            discard(theme_file)
            # A deleted theme takes its relationship file with it.
            theme_rels = theme_dir / "_rels" / f"{theme_file.name}.rels"
            if theme_rels.exists():
                discard(theme_rels)

    notes_dir = ppt_dir / "notesSlides"
    if notes_dir.exists():
        for notes_file in notes_dir.glob("*.xml"):
            if notes_file.is_file() and is_orphan(notes_file):
                discard(notes_file)

    notes_rels_dir = notes_dir / "_rels"
    if notes_rels_dir.exists():
        for notes_rels in notes_rels_dir.glob("*.rels"):
            # "notesSlide1.xml.rels" belongs to "notesSlide1.xml".
            owner = notes_dir / notes_rels.name.replace(".rels", "")
            if not owner.exists():
                discard(notes_rels)

    return removed
|
|
|
|
|
|
def update_content_types(unpacked_dir: Path, removed_files: list[str]) -> None:
    """Remove ``Override`` entries of deleted parts from ``[Content_Types].xml``.

    The file is rewritten only when at least one entry was removed. Nothing
    happens when the file is missing or ``removed_files`` is empty.

    Args:
        unpacked_dir: Root directory of the unpacked PPTX package.
        removed_files: Paths of deleted files relative to ``unpacked_dir``.
    """
    ct_path = unpacked_dir / "[Content_Types].xml"
    if not ct_path.exists():
        return

    # PartName always uses "/", whereas paths produced with str(Path) use "\"
    # on Windows; normalise so both spell a part the same way. A set also
    # makes each membership test constant-time.
    removed_parts = {name.replace("\\", "/") for name in removed_files}
    if not removed_parts:
        return

    dom = defusedxml.minidom.parse(str(ct_path))
    changed = False

    for override in list(dom.getElementsByTagName("Override")):
        part_name = override.getAttribute("PartName").lstrip("/")
        if part_name in removed_parts and override.parentNode:
            override.parentNode.removeChild(override)
            changed = True

    if changed:
        with open(ct_path, "wb") as f:
            f.write(dom.toxml(encoding="utf-8"))
|
|
|
|
|
|
def clean_unused_files(unpacked_dir: Path) -> list[str]:
    """Run every cleanup pass over an unpacked PPTX package.

    Orphaned slides and the ``[trash]`` directory are removed first. Orphaned
    ``.rels`` files and unreferenced files are then removed repeatedly until a
    pass deletes nothing, because each deletion can leave further files
    unreferenced. Finally the content-type overrides of all deleted files are
    dropped.

    Args:
        unpacked_dir: Root directory of the unpacked PPTX package.

    Returns:
        Paths of all deleted files, relative to ``unpacked_dir``.
    """
    all_removed = [
        *remove_orphaned_slides(unpacked_dir),
        *remove_trash_directory(unpacked_dir),
    ]

    while True:
        stale_rels = remove_orphaned_rels_files(unpacked_dir)
        stale_files = remove_orphaned_files(
            unpacked_dir, get_referenced_files(unpacked_dir)
        )
        batch = stale_rels + stale_files
        if not batch:
            break
        all_removed += batch

    if all_removed:
        update_content_types(unpacked_dir, all_removed)

    return all_removed
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: exactly one argument, the unpacked directory.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        for usage_line in (
            "Usage: python clean.py <unpacked_dir>",
            "Example: python clean.py unpacked/",
        ):
            print(usage_line, file=sys.stderr)
        sys.exit(1)

    unpacked_dir = Path(cli_args[0])
    if not unpacked_dir.exists():
        print(f"Error: {unpacked_dir} not found", file=sys.stderr)
        sys.exit(1)

    removed = clean_unused_files(unpacked_dir)

    # Report what was deleted, one path per line.
    if not removed:
        print("No unreferenced files found")
    else:
        print(f"Removed {len(removed)} unreferenced files:")
        for f in removed:
            print(f" {f}")
|