feat: enhance internal duplicate detection by filtering common boilerplate patterns

This commit is contained in:
2025-09-17 12:39:27 +00:00
parent 7bbbe3f5b6
commit 917b0de16c

View File

@@ -13,6 +13,15 @@ from dataclasses import dataclass
from typing import Any
# Names of lifecycle dunder methods that tend to look alike across unrelated
# classes; `_should_ignore_group` consults this set when deciding whether a
# duplicate group is only boilerplate.
COMMON_DUPLICATE_METHODS = {
    # Constructor.
    "__init__",
    # Synchronous context-manager protocol.
    "__enter__",
    "__exit__",
    # Asynchronous context-manager protocol.
    "__aenter__",
    "__aexit__",
}
@dataclass
class CodeBlock:
"""Represents a code block (function, method, or class)."""
@@ -111,6 +120,13 @@ class InternalDuplicateDetector:
pattern_groups = self._find_pattern_duplicates(blocks)
duplicate_groups.extend(pattern_groups)
filtered_groups = [
group
for group in duplicate_groups
if group.similarity_score >= self.similarity_threshold
and not self._should_ignore_group(group)
]
results = [
{
"type": group.pattern_type,
@@ -125,8 +141,7 @@ class InternalDuplicateDetector:
for block in group.blocks
],
}
for group in duplicate_groups
if group.similarity_score >= self.similarity_threshold
for group in filtered_groups
]
return {
"duplicates": results,
@@ -135,8 +150,7 @@ class InternalDuplicateDetector:
"blocks_analyzed": len(blocks),
"duplicate_lines": sum(
sum(b.end_line - b.start_line + 1 for b in g.blocks)
for g in duplicate_groups
if g.similarity_score >= self.similarity_threshold
for g in filtered_groups
),
},
}
@@ -183,25 +197,27 @@ class InternalDuplicateDetector:
complexity += len(child.values) - 1
return complexity
def extract_blocks_from_node(node: ast.AST) -> None:
def extract_blocks_from_node(
node: ast.AST,
parent: ast.AST | None = None,
) -> None:
"""Recursively extract code blocks from AST nodes."""
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
if block := create_block(node, "function", lines):
blocks.append(block)
elif isinstance(node, ast.ClassDef):
if isinstance(node, ast.ClassDef):
if block := create_block(node, "class", lines):
blocks.append(block)
# Extract methods from class
for item in node.body:
if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)) and (
method_block := create_block(item, "method", lines)
):
blocks.append(method_block)
extract_blocks_from_node(item, node)
return
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
block_type = "method" if isinstance(parent, ast.ClassDef) else "function"
if block := create_block(node, block_type, lines):
blocks.append(block)
# Recursively visit child nodes
for child in ast.iter_child_nodes(node):
extract_blocks_from_node(child)
extract_blocks_from_node(child, node)
extract_blocks_from_node(tree)
return blocks
@@ -438,6 +454,21 @@ class InternalDuplicateDetector:
# Weighted combination
return 0.6 * jaccard + 0.4 * sequence_sim
def _should_ignore_group(self, group: DuplicateGroup) -> bool:
    """Report whether a duplicate group is only lifecycle boilerplate.

    A group is ignored when every block in it is named after one of the
    entries in ``COMMON_DUPLICATE_METHODS`` and the group stays small: the
    longest block spans at most 12 lines and the highest block complexity
    is at most 3. A group with no blocks is never ignored.

    Args:
        group: Duplicate group whose ``blocks`` expose ``name``,
            ``start_line``, ``end_line`` and ``complexity``.

    Returns:
        ``True`` if the group should be dropped from the results,
        ``False`` otherwise.
    """
    members = group.blocks
    if not members:
        return False

    # A single block with a non-boilerplate name keeps the whole group.
    for member in members:
        if member.name not in COMMON_DUPLICATE_METHODS:
            return False

    longest_span = max(member.end_line - member.start_line + 1 for member in members)
    highest_complexity = max(member.complexity for member in members)

    # Only short, simple lifecycle methods may repeat across classes; larger
    # or more complex ones are still reported as duplicates.
    return longest_span <= 12 and highest_complexity <= 3
def detect_internal_duplicates(
source_code: str,