From 917b0de16ccff930c1ea3e2a465aed08a26d76ca Mon Sep 17 00:00:00 2001
From: Travis Vasceannie
Date: Wed, 17 Sep 2025 12:39:27 +0000
Subject: [PATCH] feat: enhance internal duplicate detection by filtering common boilerplate patterns

---
 hooks/internal_duplicate_detector.py | 63 +++++++++++++++++++++-------
 1 file changed, 47 insertions(+), 16 deletions(-)

diff --git a/hooks/internal_duplicate_detector.py b/hooks/internal_duplicate_detector.py
index 7de7a49..7cc06b0 100644
--- a/hooks/internal_duplicate_detector.py
+++ b/hooks/internal_duplicate_detector.py
@@ -13,6 +13,15 @@ from dataclasses import dataclass
 from typing import Any
 
 
+COMMON_DUPLICATE_METHODS = {
+    "__init__",
+    "__enter__",
+    "__exit__",
+    "__aenter__",
+    "__aexit__",
+}
+
+
 @dataclass
 class CodeBlock:
     """Represents a code block (function, method, or class)."""
@@ -111,6 +120,13 @@ class InternalDuplicateDetector:
         pattern_groups = self._find_pattern_duplicates(blocks)
         duplicate_groups.extend(pattern_groups)
 
+        filtered_groups = [
+            group
+            for group in duplicate_groups
+            if group.similarity_score >= self.similarity_threshold
+            and not self._should_ignore_group(group)
+        ]
+
         results = [
             {
                 "type": group.pattern_type,
@@ -125,8 +141,7 @@ class InternalDuplicateDetector:
                     for block in group.blocks
                 ],
             }
-            for group in duplicate_groups
-            if group.similarity_score >= self.similarity_threshold
+            for group in filtered_groups
         ]
         return {
             "duplicates": results,
@@ -135,8 +150,7 @@ class InternalDuplicateDetector:
                 "blocks_analyzed": len(blocks),
                 "duplicate_lines": sum(
                     sum(b.end_line - b.start_line + 1 for b in g.blocks)
-                    for g in duplicate_groups
-                    if g.similarity_score >= self.similarity_threshold
+                    for g in filtered_groups
                 ),
             },
         }
@@ -183,25 +197,27 @@ class InternalDuplicateDetector:
                     complexity += len(child.values) - 1
             return complexity
 
-        def extract_blocks_from_node(node: ast.AST) -> None:
+        def extract_blocks_from_node(
+            node: ast.AST,
+            parent: ast.AST | None = None,
+        ) -> None:
             """Recursively extract code blocks from AST nodes."""
-            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
-                if block := create_block(node, "function", lines):
-                    blocks.append(block)
-            elif isinstance(node, ast.ClassDef):
+            if isinstance(node, ast.ClassDef):
                 if block := create_block(node, "class", lines):
                     blocks.append(block)
-                # Extract methods from class
                 for item in node.body:
-                    if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)) and (
-                        method_block := create_block(item, "method", lines)
-                    ):
-                        blocks.append(method_block)
+                    extract_blocks_from_node(item, node)
+
+                return
+
+            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                block_type = "method" if isinstance(parent, ast.ClassDef) else "function"
+                if block := create_block(node, block_type, lines):
+                    blocks.append(block)
 
-            # Recursively visit child nodes
             for child in ast.iter_child_nodes(node):
-                extract_blocks_from_node(child)
+                extract_blocks_from_node(child, node)
 
         extract_blocks_from_node(tree)
         return blocks
@@ -438,6 +454,21 @@ class InternalDuplicateDetector:
         # Weighted combination
         return 0.6 * jaccard + 0.4 * sequence_sim
 
+    def _should_ignore_group(self, group: DuplicateGroup) -> bool:
+        """Drop duplicate groups that match common boilerplate patterns."""
+        if not group.blocks:
+            return False
+
+        if all(block.name in COMMON_DUPLICATE_METHODS for block in group.blocks):
+            max_lines = max(block.end_line - block.start_line + 1 for block in group.blocks)
+            max_complexity = max(block.complexity for block in group.blocks)
+
+            # Allow simple lifecycle dunder methods to repeat across classes.
+            if max_lines <= 12 and max_complexity <= 3:
+                return True
+
+        return False
+
 
 def detect_internal_duplicates(
     source_code: str,