feat: enhance internal duplicate detection by filtering common boilerplate patterns
@@ -13,6 +13,15 @@ from dataclasses import dataclass
 from typing import Any
 
 
+COMMON_DUPLICATE_METHODS = {
+    "__init__",
+    "__enter__",
+    "__exit__",
+    "__aenter__",
+    "__aexit__",
+}
+
+
 @dataclass
 class CodeBlock:
     """Represents a code block (function, method, or class)."""
@@ -111,6 +120,13 @@ class InternalDuplicateDetector:
         pattern_groups = self._find_pattern_duplicates(blocks)
         duplicate_groups.extend(pattern_groups)
 
+        filtered_groups = [
+            group
+            for group in duplicate_groups
+            if group.similarity_score >= self.similarity_threshold
+            and not self._should_ignore_group(group)
+        ]
+
         results = [
             {
                 "type": group.pattern_type,
@@ -125,8 +141,7 @@ class InternalDuplicateDetector:
                     for block in group.blocks
                 ],
             }
-            for group in duplicate_groups
-            if group.similarity_score >= self.similarity_threshold
+            for group in filtered_groups
         ]
         return {
             "duplicates": results,
@@ -135,8 +150,7 @@ class InternalDuplicateDetector:
                 "blocks_analyzed": len(blocks),
                 "duplicate_lines": sum(
                     sum(b.end_line - b.start_line + 1 for b in g.blocks)
-                    for g in duplicate_groups
-                    if g.similarity_score >= self.similarity_threshold
+                    for g in filtered_groups
                 ),
             },
         }
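The three hunks above hoist the threshold check into a single precomputed `filtered_groups` list, so the reported `duplicates` and the `duplicate_lines` total are built from the same set of groups and both respect the new `_should_ignore_group` check. A minimal standalone sketch of that filter-once pattern, using made-up `Block`/`Group` tuples and a `summarize` helper rather than the repository's real dataclasses:

from collections import namedtuple

# Hypothetical stand-ins for the detector's CodeBlock / DuplicateGroup objects.
Block = namedtuple("Block", "name start_line end_line")
Group = namedtuple("Group", "pattern_type similarity_score blocks")

def summarize(groups, threshold=0.8, ignore=lambda g: False):
    # Filter once, then derive both the result list and the line total
    # from the same filtered list so the two can never disagree.
    kept = [g for g in groups if g.similarity_score >= threshold and not ignore(g)]
    results = [{"type": g.pattern_type, "blocks": [b.name for b in g.blocks]} for g in kept]
    duplicate_lines = sum(b.end_line - b.start_line + 1 for g in kept for b in g.blocks)
    return results, duplicate_lines

groups = [
    Group("structural", 0.95, (Block("save", 10, 14), Block("store", 40, 44))),
    Group("structural", 0.70, (Block("parse", 60, 90), Block("load", 120, 150))),
]
print(summarize(groups))  # only the 0.95 group clears the 0.8 threshold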
@@ -183,25 +197,27 @@ class InternalDuplicateDetector:
                     complexity += len(child.values) - 1
             return complexity
 
-        def extract_blocks_from_node(node: ast.AST) -> None:
+        def extract_blocks_from_node(
+            node: ast.AST,
+            parent: ast.AST | None = None,
+        ) -> None:
             """Recursively extract code blocks from AST nodes."""
-            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
-                if block := create_block(node, "function", lines):
-                    blocks.append(block)
-            elif isinstance(node, ast.ClassDef):
+            if isinstance(node, ast.ClassDef):
                 if block := create_block(node, "class", lines):
                     blocks.append(block)
 
                 # Extract methods from class
                 for item in node.body:
-                    if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)) and (
-                        method_block := create_block(item, "method", lines)
-                    ):
-                        blocks.append(method_block)
+                    extract_blocks_from_node(item, node)
+
+                return
+
+            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
+                block_type = "method" if isinstance(parent, ast.ClassDef) else "function"
+                if block := create_block(node, block_type, lines):
+                    blocks.append(block)
 
             # Recursively visit child nodes
             for child in ast.iter_child_nodes(node):
-                extract_blocks_from_node(child)
+                extract_blocks_from_node(child, node)
 
         extract_blocks_from_node(tree)
         return blocks
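The rewritten `extract_blocks_from_node` now threads a `parent` argument through the recursion: a def found directly inside a `ClassDef` is tagged as a method, any other def as a function, and class bodies are recursed into explicitly before an early `return` so the generic child walk does not visit them a second time. A self-contained sketch of that parent-tracking idea on its own (the `label_definitions` helper below is illustrative, not the repository's API):

import ast

def label_definitions(source: str) -> list[tuple[str, str]]:
    """Classify each definition as 'class', 'method', or 'function'."""
    found: list[tuple[str, str]] = []

    def visit(node: ast.AST, parent: ast.AST | None = None) -> None:
        if isinstance(node, ast.ClassDef):
            found.append((node.name, "class"))
            for item in node.body:
                visit(item, node)   # methods see the class as their parent
            return                  # skip the generic walk for class bodies
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            kind = "method" if isinstance(parent, ast.ClassDef) else "function"
            found.append((node.name, kind))
        for child in ast.iter_child_nodes(node):
            visit(child, node)

    visit(ast.parse(source))
    return found

print(label_definitions("def top(): pass\nclass C:\n    def m(self): pass\n"))
# [('top', 'function'), ('C', 'class'), ('m', 'method')]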
@@ -438,6 +454,21 @@ class InternalDuplicateDetector:
         # Weighted combination
         return 0.6 * jaccard + 0.4 * sequence_sim
 
+    def _should_ignore_group(self, group: DuplicateGroup) -> bool:
+        """Drop duplicate groups that match common boilerplate patterns."""
+        if not group.blocks:
+            return False
+
+        if all(block.name in COMMON_DUPLICATE_METHODS for block in group.blocks):
+            max_lines = max(block.end_line - block.start_line + 1 for block in group.blocks)
+            max_complexity = max(block.complexity for block in group.blocks)
+
+            # Allow simple lifecycle dunder methods to repeat across classes.
+            if max_lines <= 12 and max_complexity <= 3:
+                return True
+
+        return False
+
 
 def detect_internal_duplicates(
     source_code: str,
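To illustrate the thresholds in `_should_ignore_group`: a group is only dropped when every block in it is one of the lifecycle dunders and the largest block stays at or under 12 lines and complexity 3. A self-contained sketch of the same predicate over plain (name, start, end, complexity) tuples, while the real method works on the detector's `DuplicateGroup`/`CodeBlock` objects:

COMMON_DUPLICATE_METHODS = {"__init__", "__enter__", "__exit__", "__aenter__", "__aexit__"}

def looks_like_boilerplate(blocks) -> bool:
    """blocks: iterable of (name, start_line, end_line, complexity) tuples."""
    blocks = list(blocks)
    if not blocks:
        return False
    if all(name in COMMON_DUPLICATE_METHODS for name, *_ in blocks):
        max_lines = max(end - start + 1 for _, start, end, _ in blocks)
        max_complexity = max(c for *_, c in blocks)
        return max_lines <= 12 and max_complexity <= 3
    return False

# Two four-line __exit__ methods with complexity 1 are treated as boilerplate.
print(looks_like_boilerplate([("__exit__", 10, 13, 1), ("__exit__", 50, 53, 1)]))   # True
# A pair of 30-line __init__ methods is still reported as a duplicate.
print(looks_like_boilerplate([("__init__", 10, 39, 2), ("__init__", 80, 109, 2)]))  # False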