Files
litellm/.github/scripts/close_duplicate_issues.py
T
2026-04-17 13:02:59 -07:00

231 lines
6.9 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Detect and close duplicate GitHub issues using title similarity.

Modes:
    --scan              Compare all open issues against each other (batch)
    --issue-number N    Check a single issue against older open issues

Runs as a dry run by default; pass --close to actually comment on, label,
and close the detected duplicates.

Requires the `gh` CLI to be authenticated.
"""
import argparse
import difflib
import json
import re
import subprocess
import sys
def normalize_title(title: str) -> str:
    """Return a canonical form of an issue title for similarity comparison.

    A leading category tag (``bug``, ``feature request``, ``enhancement``,
    ``question`` or ``docs``; case-insensitive) is removed together with its
    optional surrounding brackets and trailing colon, so ``"[Bug]: Foo"``,
    ``"Bug: Foo"`` and ``"[Bug] Foo"`` all normalize to ``"foo"``. The rest is
    lowercased and runs of whitespace are collapsed to single spaces.

    Args:
        title: Raw issue title.

    Returns:
        The normalized title; may be empty when the title held only a tag.
    """
    stripped = re.sub(
        # \b keeps words that merely start with a tag ("Bugfix", "Docstring")
        # intact, and "\]?\s*:?" consumes both the closing bracket AND the
        # colon of the "[Bug]: ..." form instead of only one of them.
        r"^\s*\[?\s*(?:bug|feature request|enhancement|question|docs)\b\s*\]?\s*:?\s*",
        "",
        title,
        flags=re.IGNORECASE,
    )
    return " ".join(stripped.lower().split())
def gh(*args: str) -> str:
    """Invoke the ``gh`` CLI with ``args`` and return what it wrote to stdout.

    Both output streams are captured as text. Because ``check=True`` is used,
    a non-zero exit status raises ``subprocess.CalledProcessError``.
    """
    command = ["gh", *args]
    completed = subprocess.run(command, check=True, text=True, capture_output=True)
    return completed.stdout
def fetch_open_issues(repo: str | None) -> list[dict]:
    """Fetch all open issues (excluding PRs) via ``gh api --paginate``.

    Args:
        repo: Repository as ``owner/repo``. When falsy, the literal
            ``{owner}/{repo}`` placeholder is sent and left for ``gh`` to
            resolve from the current repository.

    Returns:
        The open issues, requested oldest-created first, with every entry
        that carries a ``pull_request`` key removed.

    Raises:
        subprocess.CalledProcessError: If the ``gh`` invocation fails.
        json.JSONDecodeError: If the output is not valid JSON.
    """
    query = "state=open&per_page=100&sort=created&direction=asc"
    repo_path = repo if repo else "{owner}/{repo}"
    raw = gh("api", "--paginate", f"repos/{repo_path}/issues?{query}")

    # Paginated output may be a single array or several JSON documents written
    # back to back, with or without a newline between them. Decoding one
    # document at a time handles every layout, whereas parsing line by line
    # breaks on "[...][...]" and on pretty-printed JSON.
    decoder = json.JSONDecoder()
    issues: list[dict] = []
    pos, end = 0, len(raw)
    while True:
        # raw_decode() does not tolerate leading whitespace, so skip it here.
        while pos < end and raw[pos].isspace():
            pos += 1
        if pos >= end:
            break
        parsed, pos = decoder.raw_decode(raw, pos)
        if isinstance(parsed, list):
            issues.extend(parsed)
        else:
            issues.append(parsed)

    # Filter out pull requests (they also appear in the issues endpoint)
    return [item for item in issues if "pull_request" not in item]
def close_as_duplicate(
issue_number: int, duplicate_of: int, repo: str | None, dry_run: bool
) -> None:
"""Close an issue as duplicate of another, adding a comment and label."""
repo_args = ["--repo", repo] if repo else []
if dry_run:
print(
f" [DRY RUN] Would close #{issue_number} as duplicate of #{duplicate_of}"
)
return
# Add comment
comment_body = (
f"Closing as duplicate of #{duplicate_of}.\n\n"
"If you believe this is not a duplicate, please reopen and add context "
"explaining how this differs."
)
gh("issue", "comment", str(issue_number), "--body", comment_body, *repo_args)
# Add label
gh("issue", "edit", str(issue_number), "--add-label", "duplicate", *repo_args)
# Close with not_planned reason
gh(
"api",
f"repos/{repo or '{owner}/{repo}'}/issues/{issue_number}",
"-X",
"PATCH",
"-f",
"state=closed",
"-f",
"state_reason=not_planned",
)
print(f" Closed #{issue_number} as duplicate of #{duplicate_of}")
def find_duplicate(
    issue: dict, candidates: list[dict], threshold: float
) -> dict | None:
    """Return the first candidate whose normalized title reaches ``threshold``.

    Args:
        issue: Issue to check; its ``"title"`` and ``"number"`` keys are read.
        candidates: Issues to compare against, examined in the given order.
            A candidate with the same number as ``issue`` is skipped.
        threshold: Minimum ``difflib.SequenceMatcher`` ratio that counts as a
            match.

    Returns:
        The first matching candidate, or ``None`` when nothing matches or
        when the issue's normalized title is empty.
    """
    norm = normalize_title(issue["title"])
    if not norm:
        # Two empty strings compare as 100% similar, which would flag every
        # title made only of a prefix tag as a duplicate of the others.
        return None

    for candidate in candidates:
        if candidate["number"] == issue["number"]:
            continue
        cand_norm = normalize_title(candidate["title"])
        if not cand_norm:
            continue
        matcher = difflib.SequenceMatcher(None, norm, cand_norm)
        # real_quick_ratio() and quick_ratio() are cheap upper bounds on
        # ratio(), so they can only rule candidates out early; the outcome is
        # the same as testing ratio() alone.
        if (
            matcher.real_quick_ratio() >= threshold
            and matcher.quick_ratio() >= threshold
            and matcher.ratio() >= threshold
        ):
            return candidate
    return None
def scan_all(
issues: list[dict], threshold: float, repo: str | None, dry_run: bool
) -> int:
"""Compare every issue against all older issues. Returns count of duplicates found."""
# Sort oldest first
issues.sort(key=lambda i: i["number"])
closed_count = 0
for idx, issue in enumerate(issues):
older = issues[:idx]
if not older:
continue
dup = find_duplicate(issue, older, threshold)
if dup:
ratio = difflib.SequenceMatcher(
None,
normalize_title(issue["title"]),
normalize_title(dup["title"]),
).ratio()
print(
f"#{issue['number']}: \"{issue['title']}\"\n"
f" -> duplicate of #{dup['number']}: \"{dup['title']}\" "
f"({ratio:.0%} similar)"
)
close_as_duplicate(issue["number"], dup["number"], repo, dry_run)
closed_count += 1
return closed_count
def check_single(
issue_number: int,
issues: list[dict],
threshold: float,
repo: str | None,
dry_run: bool,
) -> bool:
"""Check a single issue against all older open issues. Returns True if duplicate found."""
target = None
for i in issues:
if i["number"] == issue_number:
target = i
break
if target is None:
print(f"Issue #{issue_number} not found among open issues.")
return False
older = [i for i in issues if i["number"] < issue_number]
dup = find_duplicate(target, older, threshold)
if dup:
ratio = difflib.SequenceMatcher(
None,
normalize_title(target["title"]),
normalize_title(dup["title"]),
).ratio()
print(
f"#{target['number']}: \"{target['title']}\"\n"
f" -> duplicate of #{dup['number']}: \"{dup['title']}\" "
f"({ratio:.0%} similar)"
)
close_as_duplicate(issue_number, dup["number"], repo, dry_run)
return True
print(f"#{issue_number}: no duplicate found above threshold {threshold}")
return False
def main() -> None:
parser = argparse.ArgumentParser(
description="Detect and close duplicate GitHub issues"
)
mode = parser.add_mutually_exclusive_group(required=True)
mode.add_argument("--scan", action="store_true", help="Scan all open issues")
mode.add_argument("--issue-number", type=int, help="Check a single issue number")
parser.add_argument(
"--threshold", type=float, default=0.85, help="Similarity threshold (0-1)"
)
parser.add_argument(
"--close",
action="store_true",
help="Actually close duplicates (default is dry-run)",
)
parser.add_argument(
"--repo", type=str, help="Repository (owner/repo). Auto-detected if omitted."
)
args = parser.parse_args()
dry_run = not args.close
if dry_run:
print("=== DRY RUN MODE (pass --close to actually close issues) ===\n")
print("Fetching open issues...")
issues = fetch_open_issues(args.repo)
print(f"Found {len(issues)} open issues.\n")
if args.scan:
count = scan_all(issues, args.threshold, args.repo, dry_run)
print(f"\nTotal duplicates {'found' if dry_run else 'closed'}: {count}")
else:
found = check_single(
args.issue_number, issues, args.threshold, args.repo, dry_run
)
sys.exit(0 if found else 0) # Always exit 0; finding no dup is not an error
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()