feat: implement code quality analysis toolkit with modernization, complexity and duplication detection

This commit is contained in:
2025-08-26 12:23:57 -04:00
parent 530c49accd
commit 0475c3cae6
34 changed files with 7273 additions and 0 deletions

161
.gitignore vendored Normal file
View File

@@ -0,0 +1,161 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# IDE specific files
.vscode/
.idea/
*.swp
*.swo
*~
# OS specific files
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db
# Temporary files
*.tmp
*.temp
*.bak
*.backup
# Log files
*.log
# Database files
*.db
*.sqlite
*.sqlite3
# Configuration files with secrets
.env.local
.env.production
config.local.yaml
secrets.yaml
# UV specific
.uv_cache/
# Ruff cache
.ruff_cache/
# Test artifacts
.coverage
.pytest_cache/
htmlcov/
# Build artifacts
dist/
build/
*.egg-info/

21
LICENSE Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 IntelliKit Team
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

61
pyproject.toml Normal file
View File

@@ -0,0 +1,61 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[project]
name = "claude-scripts"
version = "0.1.0"
description = "A comprehensive Python code quality analysis toolkit for detecting duplicates, complexity metrics, and modernization opportunities"
authors = [{name = "Your Name", email = "your.email@example.com"}]
readme = "README.md"
license = {file = "LICENSE"}
requires-python = ">=3.12"
classifiers = [
"Development Status :: 4 - Beta",
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Topic :: Software Development :: Quality Assurance",
"Topic :: Software Development :: Libraries :: Python Modules",
]
keywords = ["code-quality", "static-analysis", "duplicate-detection", "complexity", "refactoring"]
dependencies = [
"click>=8.0.0",
"pyyaml>=6.0",
"pydantic>=2.0.0",
"radon>=6.0.0",
]
[project.optional-dependencies]
dev = [
"pytest>=7.0.0",
"pytest-cov>=4.0.0",
"ruff>=0.1.0",
"mypy>=1.5.0",
"pre-commit>=3.0.0",
]
[project.urls]
Homepage = "https://github.com/yourusername/claude-scripts"
Repository = "https://github.com/yourusername/claude-scripts"
Issues = "https://github.com/yourusername/claude-scripts/issues"
Documentation = "https://github.com/yourusername/claude-scripts#readme"
[project.scripts]
claude-quality = "quality.cli.main:cli"
[tool.hatch.build.targets.sdist]
exclude = [
"/.github",
"/docs",
"/.vscode",
"/.pytest_cache",
"/.mypy_cache",
"/.ruff_cache",
]
[tool.hatch.build.targets.wheel]
packages = ["src/quality"]

View File

@@ -0,0 +1,131 @@
# Quality Analysis Exceptions Configuration
# This file allows you to suppress specific analysis results based on file patterns,
# line patterns, issue types, and other criteria.
exceptions:
enabled: true
# Global file and directory exclusions
# These patterns will suppress all issues for matching files/directories
exclude_files:
- "*/tests/*"
- "*/test_*"
- "*/__pycache__/*"
- "*/migrations/*"
- "*/conftest.py"
- "*/.pytest_cache/*"
exclude_directories:
- "*/venv/*"
- "*/.venv/*"
- "*/node_modules/*"
- "*/.git/*"
- "*/build/*"
- "*/dist/*"
# Specific exception rules
rules:
# Example: Suppress complexity issues in legacy code
- analysis_type: "complexity"
issue_type: "high_complexity"
file_patterns:
- "*/legacy/*"
- "*/third_party/*"
- "*/vendor/*"
reason: "Legacy code with known complexity - migration planned"
# Example: Allow intentional Pydantic v1 usage in compatibility layers
- analysis_type: "modernization"
issue_type: "pydantic_v1_pattern"
file_patterns:
- "*/compatibility/*"
- "*/adapters/*"
line_patterns:
- "# pydantic v1 required"
- "# TODO: migrate to v2"
- "# legacy compatibility"
reason: "Intentional Pydantic v1 usage for compatibility with legacy systems"
# Example: Suppress typing imports for external compatibility
- analysis_type: "modernization"
issue_type: "legacy_typing_import"
file_patterns:
- "*/external/*"
- "*/integrations/*"
reason: "External library compatibility requirements"
# Example: Allow duplicates in generated/template code
- analysis_type: "duplicates"
file_patterns:
- "*/templates/*"
- "*/generated/*"
- "*/auto_generated/*"
- "*/schemas/auto/*"
reason: "Generated or template code - duplication expected and acceptable"
# Example: Suppress modernization issues in scripts
- analysis_type: "modernization"
file_patterns:
- "*/scripts/*"
- "*/migrations/*"
reason: "Scripts and migrations prioritize backward compatibility"
# Example: Temporary suppression with expiration for gradual refactoring
- analysis_type: "complexity"
issue_type: "high_complexity"
file_patterns:
- "*/parsers/*"
- "*/processors/*"
reason: "Complex parsing logic - refactoring scheduled for Q2 2024"
expires: "2024-06-30"
enabled: true
# Example: Suppress specific modernization patterns in test files
- analysis_type: "modernization"
issue_type: "legacy_typing_import"
file_patterns:
- "**/test_*.py"
- "**/tests/*.py"
reason: "Tests may use legacy patterns for compatibility testing"
# Example: Allow specific duplicates in configuration files
- analysis_type: "duplicates"
file_patterns:
- "*/config/*"
- "*/settings/*"
line_patterns:
- "# duplicate config acceptable"
- "# shared configuration"
reason: "Configuration files may have intentional duplication"
# Analysis Types Available:
# - "complexity" - Code complexity issues (high cyclomatic/cognitive complexity)
# - "duplicates" - Duplicate code detection
# - "modernization" - Modern Python pattern suggestions
# - "code_smells" - General code smell detection (if implemented)
# Common Issue Types:
# Complexity:
# - "high_complexity" - General high complexity
# - "cyclomatic_complexity" - High cyclomatic complexity
# - "cognitive_complexity" - High cognitive complexity
#
# Modernization:
# - "legacy_typing_import" - from typing import List, Dict, etc.
# - "pydantic_v1_pattern" - Pydantic v1 usage patterns
# - "old_string_formatting" - % string formatting
# - "format_to_fstring" - .format() that could be f-strings
# - "unnecessary_object_inheritance" - class Foo(object):
#
# Duplicates:
# - "duplicate_code" - General duplicate code blocks
# Pattern Syntax:
# - file_patterns: Unix shell-style wildcards (*, **, ?, [seq])
# - line_patterns: Python regex patterns
# - Use "*" for analysis_type to match all analysis types
# - Leave issue_type empty to match all issues of that analysis type
# Expiration Format:
# - expires: "YYYY-MM-DD" format
# - Rules with past expiration dates are automatically disabled

9
src/quality/__init__.py Normal file
View File

@@ -0,0 +1,9 @@
"""Enhanced code quality analysis package."""
__version__ = "1.0.0"
__author__ = "IntelliKit Team"
__email__ = "team@intellikit.com"
# Minimal imports to prevent pre-commit failures
# Full imports can be added later when all modules are properly set up
__all__ = []

View File

@@ -0,0 +1 @@
"""Code analyzers for various quality checks."""

View File

@@ -0,0 +1,831 @@
"""Modern Python patterns analyzer."""
import ast
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from ..config.schemas import QualityConfig
from ..core.exceptions import ExceptionFilter
@dataclass
class ModernizationIssue:
    """Represents a modernization issue in code."""

    # Where the issue was found.
    file_path: str
    line_number: int
    column: int
    # Machine-readable category, e.g. "replaceable_typing_import", "bare_except".
    issue_type: str
    # Human-readable before/after snippets shown to the user.
    old_pattern: str
    suggested_fix: str
    severity: str  # "error", "warning", "info"
    description: str
    # Whether an automated fixer could safely apply suggested_fix.
    can_auto_fix: bool = True
class ModernizationAnalyzer(ast.NodeVisitor):
"""Analyzes code for non-modern Python patterns."""
# Types that can be replaced with modern built-in equivalents
REPLACEABLE_TYPING_IMPORTS = {
"List",
"Dict",
"Tuple",
"Set",
"FrozenSet",
"Union",
"Optional",
}
# Types that moved to collections module but are replaceable
COLLECTIONS_TYPING_IMPORTS = {
"Deque",
"Counter",
"DefaultDict",
"ChainMap",
"OrderedDict",
}
# Types that moved to other modules
MOVED_TYPING_IMPORTS = {
"Callable": "collections.abc",
"Coroutine": "collections.abc",
"Awaitable": "collections.abc",
"AsyncIterable": "collections.abc",
"AsyncIterator": "collections.abc",
"Iterable": "collections.abc",
"Iterator": "collections.abc",
"Generator": "collections.abc",
"Hashable": "collections.abc",
"Reversible": "collections.abc",
"Container": "collections.abc",
"Collection": "collections.abc",
"Sequence": "collections.abc",
"MutableSequence": "collections.abc",
"Set": "collections.abc",
"MutableSet": "collections.abc",
"Mapping": "collections.abc",
"MutableMapping": "collections.abc",
"Sized": "collections.abc",
"Pattern": "re",
"Match": "re",
}
# Types that must remain in typing module (no modern replacement)
REQUIRED_TYPING_IMPORTS = {
"ClassVar",
"TypeVar",
"Generic",
"Protocol",
"Final",
"Literal",
"Type",
"TypedDict",
"NewType",
"NoReturn",
"Never",
"Self",
"Unpack",
"TypeAlias",
"TypeGuard",
"TypeIs",
"Annotated",
"Any",
"overload",
"runtime_checkable",
"TYPE_CHECKING",
}
# Combined set of all recognized typing imports
ALL_TYPING_IMPORTS = (
REPLACEABLE_TYPING_IMPORTS
| COLLECTIONS_TYPING_IMPORTS
| set(MOVED_TYPING_IMPORTS.keys())
| REQUIRED_TYPING_IMPORTS
)
# Mapping for truly replaceable types
REPLACEABLE_TO_MODERN = {
"List": "list",
"Dict": "dict",
"Tuple": "tuple",
"Set": "set",
"FrozenSet": "frozenset",
"Union": "|",
"Optional": "| None",
}
# Mapping for collections types
COLLECTIONS_TO_MODERN = {
"Deque": "collections.deque",
"Counter": "collections.Counter",
"DefaultDict": "collections.defaultdict",
"ChainMap": "collections.ChainMap",
"OrderedDict": "collections.OrderedDict",
}
def __init__(
self, file_path: str, content: str, config: QualityConfig | None = None
):
self.file_path = file_path
self.content = content
self.content_lines = content.splitlines()
self.config = config or QualityConfig()
self.issues: list[ModernizationIssue] = []
self.imports: dict[str, str] = {} # name -> module
self.typing_imports: set[str] = set()
self.has_future_annotations = False
def analyze(self) -> list[ModernizationIssue]:
"""Run the modernization analysis."""
try:
tree = ast.parse(self.content)
self.visit(tree)
# Additional pattern-based checks
self._check_string_patterns()
self._check_exception_patterns()
self._check_super_patterns()
except SyntaxError:
pass # Skip files with syntax errors
return self.issues
def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
"""Check for typing imports that can be modernized."""
if node.module == "typing":
for alias in node.names:
name = alias.asname or alias.name
if alias.name in self.ALL_TYPING_IMPORTS:
self.typing_imports.add(name)
# Only flag imports that can be modernized
if alias.name in self.REPLACEABLE_TYPING_IMPORTS:
self._add_replaceable_typing_import_issue(node, alias.name, name)
elif alias.name in self.COLLECTIONS_TYPING_IMPORTS:
self._add_collections_typing_import_issue(node, alias.name, name)
elif alias.name in self.MOVED_TYPING_IMPORTS:
self._add_moved_typing_import_issue(node, alias.name, name)
# Note: REQUIRED_TYPING_IMPORTS are not flagged as issues
elif node.module == "__future__" and any(
alias.name == "annotations" for alias in node.names
):
self.has_future_annotations = True
# Track all imports for context
if node.module:
for alias in node.names:
name = alias.asname or alias.name
if name is not None and node.module is not None:
self.imports[name] = node.module
self.generic_visit(node)
def visit_Import(self, node: ast.Import) -> None:
"""Track regular imports."""
for alias in node.names:
name = alias.asname or alias.name
self.imports[name] = alias.name
self.generic_visit(node)
def visit_Subscript(self, node: ast.Subscript) -> None:
"""Check for typing usage in type annotations that can be modernized."""
typing_name = None
if isinstance(node.value, ast.Name) and node.value.id in self.typing_imports:
typing_name = node.value.id
elif (
isinstance(node.value, ast.Attribute)
and isinstance(node.value.value, ast.Name)
and node.value.value.id == "typing"
and node.value.attr in self.ALL_TYPING_IMPORTS
):
# Handle typing.List, typing.Dict etc.
typing_name = node.value.attr
if typing_name:
# Only flag usage of types that can be modernized
if typing_name in (
self.REPLACEABLE_TYPING_IMPORTS | self.COLLECTIONS_TYPING_IMPORTS
):
self._add_typing_usage_issue(node, typing_name)
elif typing_name in self.MOVED_TYPING_IMPORTS:
self._add_moved_typing_usage_issue(node, typing_name)
# Note: REQUIRED_TYPING_IMPORTS usage is not flagged
self.generic_visit(node)
def visit_BinOp(self, node: ast.BinOp) -> None:
"""Check for Union usage that could be modernized."""
if isinstance(node.op, ast.BitOr):
# This is already modern syntax (X | Y)
pass
self.generic_visit(node)
def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
"""Check function definitions for modernization opportunities."""
# Check for missing return type annotations
if not node.returns and not self._is_dunder_method(node.name):
self._add_missing_return_type_issue(node)
# Check for untyped parameters
for arg in node.args.args:
if not arg.annotation and arg.arg != "self" and arg.arg != "cls":
self._add_missing_param_type_issue(node, arg)
self.generic_visit(node)
def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
"""Check async function definitions."""
# Same checks as regular functions
if not node.returns and not self._is_dunder_method(node.name):
self._add_missing_return_type_issue(node)
for arg in node.args.args:
if not arg.annotation and arg.arg != "self" and arg.arg != "cls":
self._add_missing_param_type_issue(node, arg)
self.generic_visit(node)
def visit_ClassDef(self, node: ast.ClassDef) -> None:
"""Check class definitions for modernization opportunities."""
# Check if class inherits from object (unnecessary in Python 3)
for base in node.bases:
if isinstance(base, ast.Name) and base.id == "object":
self._add_unnecessary_object_inheritance_issue(node, base)
self.generic_visit(node)
def _add_replaceable_typing_import_issue(
self, node: ast.ImportFrom, typing_name: str, import_name: str
) -> None:
"""Add issue for typing import that can be replaced with built-ins."""
modern_replacement = self.REPLACEABLE_TO_MODERN[typing_name]
if typing_name in ["List", "Dict", "Tuple", "Set", "FrozenSet"]:
description = f"Use built-in '{modern_replacement}' instead of 'typing.{typing_name}' (Python 3.9+)"
severity = "warning"
elif typing_name == "Union":
description = (
"Use '|' union operator instead of 'typing.Union' (Python 3.10+)"
)
severity = "warning"
elif typing_name == "Optional":
description = "Use '| None' instead of 'typing.Optional' (Python 3.10+)"
severity = "warning"
else:
description = (
f"Use '{modern_replacement}' instead of 'typing.{typing_name}'"
)
severity = "warning"
self.issues.append(
ModernizationIssue(
file_path=self.file_path,
line_number=node.lineno,
column=node.col_offset,
issue_type="replaceable_typing_import",
old_pattern=f"from typing import {typing_name}",
suggested_fix=f"# Remove this import and use {modern_replacement} directly",
severity=severity,
description=description,
)
)
def _add_collections_typing_import_issue(
self, node: ast.ImportFrom, typing_name: str, import_name: str
) -> None:
"""Add issue for typing import that moved to collections."""
self.issues.append(
ModernizationIssue(
file_path=self.file_path,
line_number=node.lineno,
column=node.col_offset,
issue_type="collections_typing_import",
old_pattern=f"from typing import {typing_name}",
suggested_fix=f"from collections import {typing_name.lower()}",
severity="info",
description=f"Use 'from collections import {typing_name.lower()}' instead of 'typing.{typing_name}'",
)
)
def _add_moved_typing_import_issue(
self, node: ast.ImportFrom, typing_name: str, import_name: str
) -> None:
"""Add issue for typing import that moved to another module."""
target_module = self.MOVED_TYPING_IMPORTS[typing_name]
self.issues.append(
ModernizationIssue(
file_path=self.file_path,
line_number=node.lineno,
column=node.col_offset,
issue_type="moved_typing_import",
old_pattern=f"from typing import {typing_name}",
suggested_fix=f"from {target_module} import {typing_name}",
severity="info",
description=f"'{typing_name}' moved from 'typing' to '{target_module}' module",
)
)
def _add_typing_usage_issue(self, node: ast.Subscript, typing_name: str) -> None:
"""Add issue for typing usage that can be modernized."""
if typing_name in self.REPLACEABLE_TYPING_IMPORTS:
modern_replacement = self.REPLACEABLE_TO_MODERN[typing_name]
if typing_name in ["List", "Dict", "Tuple", "Set", "FrozenSet"]:
old_pattern = f"{typing_name}[...]"
new_pattern = f"{modern_replacement.lower()}[...]"
description = f"Use built-in '{modern_replacement}' instead of 'typing.{typing_name}'"
severity = "warning"
elif typing_name == "Union":
old_pattern = "Union[...]"
new_pattern = "... | ..."
description = "Use '|' union operator instead of 'typing.Union'"
severity = "warning"
elif typing_name == "Optional":
old_pattern = "Optional[...]"
new_pattern = "... | None"
description = "Use '| None' instead of 'typing.Optional'"
severity = "warning"
else:
return # Skip unknown replaceable types
elif typing_name in self.COLLECTIONS_TYPING_IMPORTS:
modern_replacement = self.COLLECTIONS_TO_MODERN[typing_name]
old_pattern = f"{typing_name}[...]"
new_pattern = f"{modern_replacement}[...]"
description = (
f"Use '{modern_replacement}' instead of 'typing.{typing_name}'"
)
severity = "info"
else:
return # Skip unknown types
self.issues.append(
ModernizationIssue(
file_path=self.file_path,
line_number=node.lineno,
column=node.col_offset,
issue_type="typing_usage",
old_pattern=old_pattern,
suggested_fix=new_pattern,
severity=severity,
description=description,
)
)
def _add_moved_typing_usage_issue(
self, node: ast.Subscript, typing_name: str
) -> None:
"""Add issue for typing usage that moved to another module."""
target_module = self.MOVED_TYPING_IMPORTS[typing_name]
self.issues.append(
ModernizationIssue(
file_path=self.file_path,
line_number=node.lineno,
column=node.col_offset,
issue_type="moved_typing_usage",
old_pattern=f"typing.{typing_name}[...]",
suggested_fix=f"{target_module}.{typing_name}[...]",
severity="info",
description=f"Use '{target_module}.{typing_name}' instead of 'typing.{typing_name}'",
)
)
def _add_missing_return_type_issue(
self, node: ast.FunctionDef | ast.AsyncFunctionDef
) -> None:
"""Add issue for missing return type annotation."""
self.issues.append(
ModernizationIssue(
file_path=self.file_path,
line_number=node.lineno,
column=node.col_offset,
issue_type="missing_return_type",
old_pattern=f"def {node.name}(...)",
suggested_fix=f"def {node.name}(...) -> ReturnType",
severity="info",
description="Consider adding return type annotation for better type safety",
can_auto_fix=False,
)
)
def _add_missing_param_type_issue(
self, node: ast.FunctionDef | ast.AsyncFunctionDef, arg: ast.arg
) -> None:
"""Add issue for missing parameter type annotation."""
self.issues.append(
ModernizationIssue(
file_path=self.file_path,
line_number=node.lineno,
column=node.col_offset,
issue_type="missing_param_type",
old_pattern=f"{arg.arg}",
suggested_fix=f"{arg.arg}: ParamType",
severity="info",
description=f"Consider adding type annotation for parameter '{arg.arg}'",
can_auto_fix=False,
)
)
def _add_unnecessary_object_inheritance_issue(
self, node: ast.ClassDef, base: ast.Name
) -> None:
"""Add issue for unnecessary object inheritance."""
self.issues.append(
ModernizationIssue(
file_path=self.file_path,
line_number=node.lineno,
column=node.col_offset,
issue_type="unnecessary_object_inheritance",
old_pattern=f"class {node.name}(object)",
suggested_fix=f"class {node.name}",
severity="info",
description="Inheriting from 'object' is unnecessary in Python 3",
can_auto_fix=True,
)
)
def _check_string_patterns(self) -> None:
"""Check for old-style string formatting."""
for i, line in enumerate(self.content_lines, 1):
# Check for % formatting
if re.search(r'["\'].*%[sd].*["\'].*%', line):
self.issues.append(
ModernizationIssue(
file_path=self.file_path,
line_number=i,
column=0,
issue_type="old_string_formatting",
old_pattern="'...' % (...)",
suggested_fix="f'...' or '...'.format(...)",
severity="info",
description="Consider using f-strings or .format() instead of % formatting",
)
)
# Check for .format() that could be f-string
if re.search(r'["\'].*\{.*\}.*["\']\.format\(', line):
self.issues.append(
ModernizationIssue(
file_path=self.file_path,
line_number=i,
column=0,
issue_type="format_to_fstring",
old_pattern="'...{}'.format(...)",
suggested_fix="f'...{...}'",
severity="info",
description="Consider using f-strings instead of .format() for better readability",
)
)
def _check_exception_patterns(self) -> None:
"""Check for old-style exception handling."""
for i, line in enumerate(self.content_lines, 1):
# Check for bare except
if re.search(r"except\s*:", line.strip()):
self.issues.append(
ModernizationIssue(
file_path=self.file_path,
line_number=i,
column=0,
issue_type="bare_except",
old_pattern="except:",
suggested_fix="except Exception:",
severity="warning",
description="Use specific exception types instead of bare except",
)
)
def _check_super_patterns(self) -> None:
"""Check for old-style super() calls."""
for i, line in enumerate(self.content_lines, 1):
# Check for old-style super calls
if re.search(r"super\(\s*\w+\s*,\s*self\s*\)", line):
self.issues.append(
ModernizationIssue(
file_path=self.file_path,
line_number=i,
column=0,
issue_type="old_super_call",
old_pattern="super(ClassName, self)",
suggested_fix="super()",
severity="info",
description="Use super() without arguments (Python 3+)",
)
)
def _is_dunder_method(self, name: str) -> bool:
"""Check if method name is a dunder method."""
return name.startswith("__") and name.endswith("__")
class PydanticAnalyzer:
    """Analyzes Pydantic usage patterns and migration opportunities.

    Purely line/regex based (no AST), so results are heuristic: v1 patterns
    are only reported in files that mention pydantic in their first 20 lines,
    and lines containing known v2 method calls are skipped.
    """

    # Regex pattern -> explanation of the Pydantic v1 idiom it detects.
    V1_PATTERNS = {
        # Model configuration patterns
        r"class\s+Config:": "Use model_config instead of Config class (Pydantic v2)",
        # Field patterns
        r"Field\([^)]*allow_mutation=": "allow_mutation is deprecated, use frozen instead",
        r"Field\([^)]*regex=": "regex parameter is deprecated, use pattern instead",
        r"Field\([^)]*min_length=": "Consider using StringConstraints for string validation",
        r"Field\([^)]*max_length=": "Consider using StringConstraints for string validation",
        # Validator patterns
        r"@validator": "@validator is deprecated, use @field_validator instead",
        r"@root_validator": "@root_validator is deprecated, use @model_validator instead",
        r"pre=True": "pre parameter syntax changed in Pydantic v2",
        # Model methods
        r"\.dict\(\)": "Use .model_dump() instead of .dict() (Pydantic v2)",
        r"\.json\(\)": "Use .model_dump_json() instead of .json() (Pydantic v2)",
        r"\.parse_obj\(": "Use model_validate() instead of parse_obj() (Pydantic v2)",
        r"\.parse_raw\(": "Use model_validate_json() instead of parse_raw() (Pydantic v2)",
        r"\.schema\(\)": "Use model_json_schema() instead of schema() (Pydantic v2)",
        r"\.copy\(\)": "Use model_copy() instead of copy() (Pydantic v2)",
        # Import patterns
        r"from pydantic import.*BaseSettings": "BaseSettings moved to pydantic-settings package",
    }

    # Pydantic v2 methods that should NEVER be flagged as issues when used with model classes
    V2_METHODS = {
        "model_validate",
        "model_validate_json",
        "model_dump",
        "model_dump_json",
        "model_copy",
        "model_json_schema",
        "model_rebuild",
        "model_fields",
        "model_fields_set",
        "model_computed_fields",
        "model_config",
        "model_extra",
    }

    # Substrings suggesting intentional v1 usage.  Entries MUST be lowercase:
    # they are matched against content.lower() in _is_intentional_v1_usage().
    INTENTIONAL_V1_CONTEXTS = {
        "pydantic.v1",  # Explicit v1 import
        "pydantic_v1",  # Common alias for v1
        "__pydantic_model__",  # v1 compatibility marker
        "model_rebuild",  # Sometimes used in migration contexts
        "# pydantic v1",  # Comment indicating intentional v1 usage
        # Fix: was "# TODO: migrate" (uppercase), which can never appear in
        # lowercased content and therefore never matched.
        "# todo: migrate",  # Comment indicating planned migration
    }

    def __init__(self, file_path: str, content: str):
        self.file_path = file_path
        self.content = content
        self.content_lines = content.splitlines()
        self.issues: list[ModernizationIssue] = []

    def analyze(self) -> "list[ModernizationIssue]":
        """Analyze Pydantic usage patterns and return collected issues."""
        has_pydantic_import = self._has_pydantic_import()
        if not has_pydantic_import:
            return []
        # Check if this looks like intentional v1 usage (downgrades severity)
        is_intentional_v1 = self._is_intentional_v1_usage()
        for i, line in enumerate(self.content_lines, 1):
            # Skip lines that contain valid Pydantic v2 patterns
            if self._is_valid_v2_pattern(line):
                continue
            for pattern, description in self.V1_PATTERNS.items():
                if re.search(pattern, line):
                    severity = "info" if is_intentional_v1 else "warning"
                    # Determine suggested fix based on pattern
                    suggested_fix = self._get_suggested_fix(pattern, line)
                    self.issues.append(
                        ModernizationIssue(
                            file_path=self.file_path,
                            line_number=i,
                            column=0,
                            issue_type="pydantic_v1_pattern",
                            old_pattern=pattern,
                            suggested_fix=suggested_fix,
                            severity=severity,
                            description=description,
                            can_auto_fix=pattern
                            in [r"\.dict\(\)", r"\.json\(\)", r"\.copy\(\)"],
                        )
                    )
        return self.issues

    def _has_pydantic_import(self) -> bool:
        """Check if file imports Pydantic."""
        return any(
            "pydantic" in line for line in self.content_lines[:20]
        )  # Check first 20 lines

    def _is_intentional_v1_usage(self) -> bool:
        """Check if this appears to be intentional v1 usage."""
        content_lower = self.content.lower()
        return any(context in content_lower for context in self.INTENTIONAL_V1_CONTEXTS)

    def _is_valid_v2_pattern(self, line: str) -> bool:
        """Check if line contains valid Pydantic v2 patterns that should not be flagged."""
        # Check if line contains any valid v2 methods
        return any(f".{v2_method}(" in line for v2_method in self.V2_METHODS)

    def _get_suggested_fix(self, pattern: str, line: str) -> str:
        """Get suggested fix for a Pydantic pattern."""
        fixes = {
            r"\.dict\(\)": line.replace(".dict()", ".model_dump()"),
            r"\.json\(\)": line.replace(".json()", ".model_dump_json()"),
            r"\.copy\(\)": line.replace(".copy()", ".model_copy()"),
            r"@validator": line.replace("@validator", "@field_validator"),
            r"@root_validator": line.replace("@root_validator", "@model_validator"),
        }
        for fix_pattern, fix_line in fixes.items():
            if re.search(fix_pattern, line):
                return fix_line.strip()
        return "See Pydantic v2 migration guide"
class ModernizationEngine:
"""Main engine for running modernization analysis."""
def __init__(self, config: QualityConfig | None = None):
self.config = config or QualityConfig()
# Import here to avoid circular imports
self.exception_filter = ExceptionFilter(self.config)
def analyze_file(self, file_path: Path) -> list[ModernizationIssue]:
"""Analyze a single file for modernization opportunities."""
try:
with open(file_path, encoding="utf-8") as f:
content = f.read()
except (OSError, UnicodeDecodeError):
return []
issues = []
# Python modernization analysis
python_analyzer = ModernizationAnalyzer(str(file_path), content, self.config)
issues.extend(python_analyzer.analyze())
# Pydantic analysis
pydantic_analyzer = PydanticAnalyzer(str(file_path), content)
issues.extend(pydantic_analyzer.analyze())
return issues
def analyze_files(
self, file_paths: list[Path]
) -> dict[Path, list[ModernizationIssue]]:
"""Analyze multiple files for modernization opportunities."""
results = {}
for file_path in file_paths:
if file_path.suffix.lower() == ".py":
issues = self.analyze_file(file_path)
# Apply exception filtering
filtered_issues = self.exception_filter.filter_issues(
"modernization",
issues,
get_file_path_fn=lambda issue: issue.file_path,
get_line_number_fn=lambda issue: issue.line_number,
get_issue_type_fn=lambda issue: issue.issue_type,
get_line_content_fn=lambda issue: self._get_line_content(
issue.file_path, issue.line_number
),
)
if filtered_issues: # Only include files with remaining issues
results[file_path] = filtered_issues
return results
def _get_line_content(self, file_path: str, line_number: int) -> str:
"""Get the content of a specific line from a file."""
try:
with open(file_path, encoding="utf-8") as f:
lines = f.readlines()
if 1 <= line_number <= len(lines):
return lines[line_number - 1].strip()
except (OSError, UnicodeDecodeError):
pass
return ""
def get_summary(
self, results: dict[Path, list[ModernizationIssue]]
) -> dict[str, Any]:
"""Generate summary of modernization analysis."""
all_issues = []
for issues in results.values():
if issues is not None:
all_issues.extend(issues)
# Group by issue type
by_type: dict[str, list[ModernizationIssue]] = {}
by_severity = {"error": 0, "warning": 0, "info": 0}
for issue in all_issues:
by_type.setdefault(issue.issue_type, []).append(issue)
by_severity[issue.severity] += 1
# Top files with most issues
file_counts = {}
for file_path, issues in results.items():
if issues:
file_counts[file_path] = len(issues)
top_files = sorted(file_counts.items(), key=lambda x: x[1], reverse=True)[:10]
# Auto-fixable issues
auto_fixable = sum(1 for issue in all_issues if issue.can_auto_fix)
return {
"total_files_analyzed": len(results),
"files_with_issues": len(
[
f
for f, issues in results.items()
if issues is not None and len(issues) > 0
]
),
"total_issues": len(all_issues),
"by_severity": by_severity,
"by_type": {k: len(v) for k, v in by_type.items()},
"auto_fixable_count": auto_fixable,
"top_files_with_issues": [(str(f), count) for f, count in top_files],
"recommendations": self._generate_recommendations(by_type, by_severity),
}
def _generate_recommendations(
    self, by_type: dict[str, list[ModernizationIssue]], by_severity: dict[str, int]
) -> list[str]:
    """Generate recommendations based on analysis results."""
    recommendations: list[str] = []

    # Count-gated rules: only emitted when at least one issue of the type
    # was found.
    counted_rules = (
        (
            "replaceable_typing_import",
            "🔄 Update {n} typing imports to use modern built-in types (Python 3.9+)",
        ),
        (
            "collections_typing_import",
            "📦 Update {n} typing imports to use collections module",
        ),
        (
            "moved_typing_import",
            "🔀 Update {n} typing imports that moved to other modules",
        ),
        (
            "typing_usage",
            "⚡ Modernize {n} type annotations to use built-ins or | union syntax",
        ),
        (
            "moved_typing_usage",
            "🔀 Update {n} type annotations that moved to other modules",
        ),
    )
    for issue_type, template in counted_rules:
        found = len(by_type.get(issue_type, []))
        if found > 0:
            recommendations.append(template.format(n=found))

    # Presence-gated rules: emitted whenever the key exists in by_type,
    # mirroring the original membership checks.
    presence_rules = (
        ("pydantic_v1_pattern", "📦 Migrate {n} Pydantic v1 patterns to v2 API"),
        (
            "old_string_formatting",
            "✨ Replace {n} old string formatting patterns with f-strings",
        ),
        (
            "bare_except",
            "⚠️ Fix {n} bare except clauses for better error handling",
        ),
    )
    for issue_type, template in presence_rules:
        if issue_type in by_type:
            recommendations.append(template.format(n=len(by_type[issue_type])))

    warning_total = by_severity["warning"]
    if warning_total > 10:
        recommendations.append(
            f"🚨 Address {warning_total} warning-level issues for better code quality"
        )
    return recommendations

View File

@@ -0,0 +1 @@
"""CLI interface for the quality analysis package."""

691
src/quality/cli/main.py Normal file
View File

@@ -0,0 +1,691 @@
#!/usr/bin/env python3
"""Main CLI interface for code quality analysis."""
import ast
import csv
import json
import sys
from pathlib import Path
from typing import Any
import click
from ..analyzers.modernization import ModernizationEngine
from ..complexity.analyzer import ComplexityAnalyzer
from ..config.schemas import QualityConfig, _load_from_yaml, load_config
from ..core.ast_analyzer import ASTAnalyzer
from ..core.exceptions import create_exceptions_config_template
from ..detection.engine import DuplicateDetectionEngine
from ..utils.file_finder import FileFinder
@click.group()
@click.option(
    "--config",
    "-c",
    type=click.Path(exists=True, path_type=Path),
    help="Path to configuration file",
)
@click.option(
    "--exceptions-file",
    "-e",
    type=click.Path(exists=True, path_type=Path),
    help="Path to exceptions configuration file",
)
@click.option("--verbose", "-v", is_flag=True, help="Enable verbose output")
@click.pass_context
def cli(
    ctx: click.Context, config: Path | None, exceptions_file: Path | None, verbose: bool
) -> None:
    """Code quality analysis toolkit."""
    # Shared state for all subcommands lives in ctx.obj.
    ctx.ensure_object(dict)
    # Load configuration (load_config handles a None path).
    quality_config = load_config(config)
    quality_config.verbose = verbose
    # Load exceptions configuration if provided.
    # NOTE(review): the hasattr guard silently ignores files without an
    # "exceptions" attribute on the loaded object — confirm that is the
    # intended behavior for malformed exception files.
    if exceptions_file:
        exceptions_data = _load_from_yaml(exceptions_file)
        if hasattr(exceptions_data, "exceptions"):
            quality_config.exceptions = exceptions_data.exceptions
    ctx.obj["config"] = quality_config
    ctx.obj["verbose"] = verbose
@cli.command()
@click.argument(
    "paths", nargs=-1, required=True, type=click.Path(exists=True, path_type=Path)
)
@click.option("--threshold", "-t", default=0.8, help="Similarity threshold (0.0-1.0)")
@click.option("--min-lines", default=5, help="Minimum lines for duplicate detection")
@click.option("--min-tokens", default=50, help="Minimum tokens for duplicate detection")
@click.option("--output", "-o", type=click.File("w"), help="Output file for results")
@click.option(
    "--format",
    "output_format",
    default="json",
    type=click.Choice(["json", "console", "csv"]),
    help="Output format",
)
@click.pass_context
def duplicates(
    ctx: click.Context,
    paths: tuple[Path, ...],
    threshold: float,
    min_lines: int,
    min_tokens: int,
    output: Any,
    output_format: str,
) -> None:
    """Detect duplicate code patterns."""
    config: QualityConfig = ctx.obj["config"]
    verbose: bool = ctx.obj["verbose"]
    # CLI options override whatever the config file specified.
    config.detection.similarity_threshold = threshold
    config.detection.min_lines = min_lines
    config.detection.min_tokens = min_tokens
    if verbose:
        click.echo(f"🔍 Analyzing paths: {', '.join(str(p) for p in paths)}")
        click.echo(f"📊 Similarity threshold: {threshold}")
        click.echo(f"📏 Min lines: {min_lines}, Min tokens: {min_tokens}")
    # Expand directory arguments into contained Python files; keep explicit
    # file arguments as-is.
    file_finder = FileFinder(config.paths, config.languages)
    all_files = []
    for path in paths:
        if path.is_file():
            all_files.append(path)
        else:
            files = file_finder.find_files(path)
            all_files.extend(files)
    if not all_files:
        click.echo("❌ No Python files found in the specified paths.", err=True)
        return
    if verbose:
        click.echo(f"📂 Found {len(all_files)} Python files")
    # Run duplicate detection
    engine = DuplicateDetectionEngine(config)
    duplicates_found = engine.detect_duplicates_in_files(all_files)
    if verbose:
        click.echo(f"🔍 Found {len(duplicates_found)} duplicate groups")
    # Build the serializable result structure shared by all output formats.
    results: dict[str, Any] = {
        "summary": {
            "total_files_analyzed": len(all_files),
            "duplicate_groups_found": len(duplicates_found),
            "total_duplicate_blocks": sum(
                len(match.blocks) for match in duplicates_found
            ),
            "configuration": {
                "similarity_threshold": threshold,
                "min_lines": min_lines,
                "min_tokens": min_tokens,
            },
        },
        "duplicates": [],
    }
    for i, match in enumerate(duplicates_found, 1):
        detailed_analysis = engine.get_detailed_analysis(match)
        results["duplicates"].append({"group_id": i, "analysis": detailed_analysis})
    # Emit in the requested format.
    if output_format == "json":
        if output:
            json.dump(results, output, indent=2, default=str)
        else:
            click.echo(json.dumps(results, indent=2, default=str))
    elif output_format == "console":
        _print_console_duplicates(results, verbose)
    elif output_format == "csv":
        _print_csv_duplicates(results, output)
@cli.command()
@click.argument(
    "paths", nargs=-1, required=True, type=click.Path(exists=True, path_type=Path)
)
@click.option("--threshold", default=10, help="Complexity threshold")
@click.option("--output", "-o", type=click.File("w"), help="Output file for results")
@click.option(
    "--format",
    "output_format",
    default="json",
    type=click.Choice(["json", "console"]),
    help="Output format",
)
@click.pass_context
def complexity(
    ctx: click.Context,
    paths: tuple[Path, ...],
    threshold: int,
    output: Any,
    output_format: str,
) -> None:
    """Analyze code complexity."""
    config: QualityConfig = ctx.obj["config"]
    verbose: bool = ctx.obj["verbose"]
    # CLI threshold overrides the configured one.
    config.complexity.complexity_threshold = threshold
    if verbose:
        click.echo(f"🔍 Analyzing complexity in: {', '.join(str(p) for p in paths)}")
        click.echo(f"📊 Complexity threshold: {threshold}")
    # Expand directory arguments into contained Python files; keep explicit
    # file arguments as-is.
    file_finder = FileFinder(config.paths, config.languages)
    all_files = []
    for path in paths:
        if path.is_file():
            all_files.append(path)
        else:
            files = file_finder.find_files(path)
            all_files.extend(files)
    if not all_files:
        click.echo("❌ No Python files found in the specified paths.", err=True)
        return
    if verbose:
        click.echo(f"📂 Found {len(all_files)} Python files")
    # Run complexity analysis
    analyzer = ComplexityAnalyzer(config.complexity)
    overview = analyzer.get_project_complexity_overview(all_files)
    # Emit in the requested format.
    if output_format == "json":
        if output:
            json.dump(overview, output, indent=2, default=str)
        else:
            click.echo(json.dumps(overview, indent=2, default=str))
    elif output_format == "console":
        _print_console_complexity(overview, verbose)
@cli.command()
@click.argument(
    "paths", nargs=-1, required=True, type=click.Path(exists=True, path_type=Path)
)
@click.option(
    "--include-type-hints", is_flag=True, help="Include missing type hint analysis"
)
@click.option("--pydantic-only", is_flag=True, help="Only analyze Pydantic patterns")
@click.option("--output", "-o", type=click.File("w"), help="Output file for results")
@click.option(
    "--format",
    "output_format",
    default="json",
    type=click.Choice(["json", "console"]),
    help="Output format",
)
@click.pass_context
def modernization(
    ctx: click.Context,
    paths: tuple[Path, ...],
    include_type_hints: bool,
    pydantic_only: bool,
    output: Any,
    output_format: str,
) -> None:
    """Analyze code for modernization opportunities."""
    config: QualityConfig = ctx.obj["config"]
    verbose: bool = ctx.obj["verbose"]
    if verbose:
        click.echo(
            f"🔍 Analyzing modernization opportunities in: {', '.join(str(p) for p in paths)}"
        )
        if include_type_hints:
            click.echo("📝 Including type hint analysis")
        if pydantic_only:
            click.echo("📦 Pydantic-only analysis mode")
    # Expand directory arguments into contained Python files; keep explicit
    # file arguments as-is.
    file_finder = FileFinder(config.paths, config.languages)
    all_files = []
    for path in paths:
        if path.is_file():
            all_files.append(path)
        else:
            files = file_finder.find_files(path)
            all_files.extend(files)
    if not all_files:
        click.echo("❌ No Python files found in the specified paths.", err=True)
        return
    if verbose:
        click.echo(f"📂 Found {len(all_files)} Python files")
    # Run modernization analysis
    engine = ModernizationEngine(config)
    results = engine.analyze_files(all_files)
    summary = engine.get_summary(results)
    # Restrict to Pydantic v1 findings when requested.
    if pydantic_only:
        filtered_results = {}
        for file_path, issues in results.items():
            pydantic_issues = [
                issue for issue in issues if issue.issue_type == "pydantic_v1_pattern"
            ]
            if pydantic_issues:
                filtered_results[file_path] = pydantic_issues
        results = filtered_results
        # Recalculate summary over the filtered result set.
        summary = engine.get_summary(results)
    # Output results
    final_results = {
        "summary": summary,
        "files": {
            str(file_path): [issue.__dict__ for issue in issues]
            for file_path, issues in results.items()
            if issues
        },
    }
    if output_format == "json":
        if output:
            json.dump(final_results, output, indent=2, default=str)
        else:
            click.echo(json.dumps(final_results, indent=2, default=str))
    elif output_format == "console":
        _print_console_modernization(final_results, verbose, include_type_hints)
@cli.command()
@click.argument(
    "paths", nargs=-1, required=True, type=click.Path(exists=True, path_type=Path)
)
@click.option("--output", "-o", type=click.File("w"), help="Output file for results")
@click.option(
    "--format",
    "output_format",
    default="json",
    type=click.Choice(["json", "console"]),
    help="Output format",
)
@click.pass_context
def full_analysis(
    ctx: click.Context, paths: tuple[Path, ...], output: Any, output_format: str
) -> None:
    """Run comprehensive code quality analysis."""
    config: QualityConfig = ctx.obj["config"]
    verbose: bool = ctx.obj["verbose"]
    if verbose:
        click.echo(
            f"🔍 Running full quality analysis on: {', '.join(str(p) for p in paths)}"
        )
    # Expand directory arguments into contained Python files; keep explicit
    # file arguments as-is.
    file_finder = FileFinder(config.paths, config.languages)
    all_files = []
    for path in paths:
        if path.is_file():
            all_files.append(path)
        else:
            files = file_finder.find_files(path)
            all_files.extend(files)
    if not all_files:
        click.echo("❌ No Python files found in the specified paths.", err=True)
        return
    if verbose:
        click.echo(f"📂 Found {len(all_files)} Python files")
    # Run all analyses
    results: dict[str, Any] = {
        "metadata": {
            "total_files": len(all_files),
            "analyzed_paths": [str(p) for p in paths],
            "configuration": config.dict(),
        }
    }
    # Complexity analysis
    if verbose:
        click.echo("📊 Running complexity analysis...")
    complexity_analyzer = ComplexityAnalyzer(config.complexity)
    results["complexity"] = complexity_analyzer.get_project_complexity_overview(
        all_files
    )
    # Duplicate detection
    if verbose:
        click.echo("🔍 Running duplicate detection...")
    duplicate_engine = DuplicateDetectionEngine(config)
    duplicates_found = duplicate_engine.detect_duplicates_in_files(all_files)
    results["duplicates"] = {
        "summary": {
            "duplicate_groups_found": len(duplicates_found),
            "total_duplicate_blocks": sum(
                len(match.blocks) for match in duplicates_found
            ),
        },
        "details": [],
    }
    for i, match in enumerate(duplicates_found, 1):
        detailed_analysis = duplicate_engine.get_detailed_analysis(match)
        duplicate_details = results["duplicates"]["details"]
        if isinstance(duplicate_details, list):
            duplicate_details.append({"group_id": i, "analysis": detailed_analysis})
    # Code smells detection
    if verbose:
        click.echo("👃 Detecting code smells...")
    all_smells = []
    for file_path in all_files:
        try:
            with open(file_path, encoding="utf-8") as f:
                content = f.read()
            ast_analyzer = ASTAnalyzer(str(file_path), content)
            # Parse the AST and analyze
            tree = ast.parse(content)
            ast_analyzer.visit(tree)
            smells = ast_analyzer.detect_code_smells()
            if smells:
                all_smells.extend(
                    [{"file": str(file_path), "smell": smell} for smell in smells]
                )
        # NOTE(review): deliberately broad best-effort catch — unreadable or
        # unparseable files are skipped silently; consider logging when
        # verbose is set.
        except Exception:
            continue
    results["code_smells"] = {"total_smells": len(all_smells), "details": all_smells}
    # Generate overall quality score
    results["quality_score"] = _calculate_overall_quality_score(results)
    # Output results
    if output_format == "json":
        if output:
            json.dump(results, output, indent=2, default=str)
        else:
            click.echo(json.dumps(results, indent=2, default=str))
    elif output_format == "console":
        _print_console_full_analysis(results, verbose)
def _print_console_duplicates(results: dict[str, Any], verbose: bool) -> None:
    """Print duplicate results in console format.

    Args:
        results: Result dict built by the ``duplicates`` command
            ("summary" plus a "duplicates" list).
        verbose: When True, also prints refactoring suggestions per group.
    """
    summary = results["summary"]
    click.echo("\n🔍 DUPLICATE CODE ANALYSIS")
    click.echo("=" * 50)
    click.echo(f"📂 Files analyzed: {summary['total_files_analyzed']}")
    click.echo(f"🔄 Duplicate groups: {summary['duplicate_groups_found']}")
    click.echo(f"📊 Total duplicate blocks: {summary['total_duplicate_blocks']}")
    if not results["duplicates"]:
        click.echo("\n✅ No significant duplicate code patterns found!")
        return
    click.echo(f"\n🚨 Found {len(results['duplicates'])} duplicate groups:")
    for dup in results["duplicates"]:
        analysis = dup["analysis"]
        match_info = analysis["match_info"]
        click.echo(f"\n📋 Group #{dup['group_id']}")
        click.echo(f" Similarity: {match_info['similarity_score']:.2%}")
        click.echo(f" Priority: {match_info['priority_score']:.2f}")
        click.echo(f" Type: {match_info['match_type']}")
        click.echo(" 📁 Affected files:")
        for block in analysis["blocks"]:
            click.echo(f"{block['file_path']} (lines {block['line_range']})")
        if verbose and analysis["refactoring_suggestions"]:
            click.echo(" 💡 Refactoring suggestions:")
            for suggestion in analysis["refactoring_suggestions"]:
                click.echo(f"{suggestion}")
def _print_csv_duplicates(results: dict[str, Any], output: Any) -> None:
"""Print duplicate results in CSV format."""
if not output:
output = sys.stdout
writer = csv.writer(output)
writer.writerow(
[
"Group ID",
"Similarity Score",
"Priority Score",
"Match Type",
"File Path",
"Line Range",
"Lines of Code",
"Estimated Effort",
"Risk Level",
]
)
for dup in results["duplicates"]:
analysis = dup["analysis"]
match_info = analysis["match_info"]
for block in analysis["blocks"]:
writer.writerow(
[
dup["group_id"],
f"{match_info['similarity_score']:.2%}",
f"{match_info['priority_score']:.2f}",
match_info["match_type"],
block["file_path"],
block["line_range"],
block["lines_of_code"],
analysis.get("estimated_effort", "Unknown"),
analysis.get("risk_assessment", "Unknown"),
]
)
def _print_console_complexity(results: dict[str, Any], verbose: bool) -> None:
    """Print complexity results in console format.

    Args:
        results: Project overview produced by
            ``ComplexityAnalyzer.get_project_complexity_overview``.
        verbose: Accepted for signature parity with the other printers;
            not currently used in this function.
    """
    click.echo("\n📊 COMPLEXITY ANALYSIS")
    click.echo("=" * 50)
    summary = results["summary"]
    click.echo(f"📂 Total files: {results['total_files']}")
    click.echo(f"📏 Total lines: {results['total_lines_of_code']}")
    click.echo(f"⚙️ Total functions: {results['total_functions']}")
    click.echo(f"🏗️ Total classes: {results['total_classes']}")
    click.echo("\n📈 Average metrics:")
    click.echo(f" Complexity score: {summary['average_complexity_score']}")
    click.echo(f" Cyclomatic complexity: {summary['average_cyclomatic_complexity']}")
    click.echo(f" Maintainability index: {summary['average_maintainability_index']}")
    click.echo("\n📊 Complexity distribution:")
    for level, count in results["distribution"].items():
        click.echo(f" {level}: {count} files")
    if results["high_complexity_files"]:
        click.echo(
            f"\n🚨 High complexity files (top {len(results['high_complexity_files'])}):"
        )
        for file_info in results["high_complexity_files"]:
            click.echo(
                f"{file_info['file']} (score: {file_info['score']:.1f}, level: {file_info['level']})"
            )
    if results["recommendations"]:
        click.echo("\n💡 Recommendations:")
        for rec in results["recommendations"]:
            click.echo(f" {rec}")
def _print_console_modernization(
    results: dict[str, Any], verbose: bool, include_type_hints: bool
) -> None:
    """Print modernization results in console format.

    Args:
        results: Dict with "summary" and "files" sections built by the
            ``modernization`` command.
        verbose: When True, prints per-file issue details.
        include_type_hints: Accepted for signature parity; not used in the
            current rendering.
    """
    summary = results["summary"]
    click.echo("\n🔄 MODERNIZATION ANALYSIS")
    click.echo("=" * 50)
    click.echo(f"📂 Files analyzed: {summary['total_files_analyzed']}")
    click.echo(f"⚠️ Files with issues: {summary['files_with_issues']}")
    click.echo(f"🔧 Total issues: {summary['total_issues']}")
    click.echo(f"✅ Auto-fixable: {summary['auto_fixable_count']}")
    click.echo("\n📊 Issues by severity:")
    for severity, count in summary["by_severity"].items():
        if count > 0:
            icon = (
                "🚨" if severity == "error" else "⚠️" if severity == "warning" else ""
            )
            click.echo(f" {icon} {severity.title()}: {count}")
    click.echo("\n📋 Issues by type:")
    for issue_type, count in summary["by_type"].items():
        click.echo(f"{issue_type.replace('_', ' ').title()}: {count}")
    if summary["top_files_with_issues"]:
        click.echo("\n🗂️ Files with most issues:")
        for file_path, count in summary["top_files_with_issues"][:5]:
            click.echo(f"{file_path}: {count} issues")
    if summary["recommendations"]:
        click.echo("\n💡 Recommendations:")
        for rec in summary["recommendations"]:
            click.echo(f" {rec}")
    if verbose and results["files"]:
        click.echo("\n📝 Detailed issues:")
        # Only a handful of files/issues are shown to keep output short.
        for file_path, issues in list(results["files"].items())[:5]:  # Show top 5 files
            click.echo(f"\n 📁 {file_path}:")
            for issue in issues[:3]:  # Show first 3 issues per file
                severity_icon = (
                    "🚨"
                    if issue["severity"] == "error"
                    else "⚠️"
                    if issue["severity"] == "warning"
                    else ""
                )
                click.echo(
                    f" {severity_icon} Line {issue['line_number']}: {issue['description']}"
                )
                if issue["can_auto_fix"]:
                    click.echo(f" 🔧 Suggested fix: {issue['suggested_fix']}")
            if len(issues) > 3:
                click.echo(f" ... and {len(issues) - 3} more issues")
def _print_console_full_analysis(results: dict[str, Any], verbose: bool) -> None:
    """Print full analysis results in console format.

    Args:
        results: Combined results dict built by the ``full_analysis``
            command (metadata, complexity, duplicates, code_smells,
            quality_score).
        verbose: When True, lists the first few individual code smells.
    """
    click.echo("\n🎯 COMPREHENSIVE CODE QUALITY ANALYSIS")
    click.echo("=" * 60)
    metadata = results["metadata"]
    click.echo(f"📂 Total files analyzed: {metadata['total_files']}")
    click.echo(f"📍 Paths: {', '.join(metadata['analyzed_paths'])}")
    click.echo(f"🎯 Overall quality score: {results['quality_score']:.1f}/100")
    # Complexity summary.  NOTE: the local names `complexity`/`duplicates`
    # shadow the CLI commands of the same name, but only inside this scope.
    complexity = results["complexity"]
    click.echo("\n📊 COMPLEXITY METRICS")
    click.echo(f" Average score: {complexity['summary']['average_complexity_score']}")
    click.echo(f" High complexity files: {len(complexity['high_complexity_files'])}")
    # Duplicates summary
    duplicates = results["duplicates"]
    click.echo("\n🔄 DUPLICATE DETECTION")
    click.echo(
        f" Duplicate groups: {duplicates['summary']['duplicate_groups_found']}"
    )
    click.echo(
        f" Total duplicate blocks: {duplicates['summary']['total_duplicate_blocks']}"
    )
    # Code smells summary
    smells = results["code_smells"]
    click.echo("\n👃 CODE SMELLS")
    click.echo(f" Total issues: {smells['total_smells']}")
    if verbose and smells["details"]:
        click.echo(" Details:")
        for smell in smells["details"][:10]:  # Show first 10
            click.echo(f"{smell['file']}: {smell['smell']}")
        if len(smells["details"]) > 10:
            click.echo(f" ... and {len(smells['details']) - 10} more")
def _calculate_overall_quality_score(results: dict[str, Any]) -> float:
"""Calculate an overall quality score based on all metrics."""
score = 100.0
# Complexity penalty (max -30 points)
complexity = results["complexity"]
avg_complexity = complexity["summary"]["average_complexity_score"]
if avg_complexity > 50:
score -= min(30, (avg_complexity - 50) * 0.6)
# Duplicate penalty (max -30 points)
duplicates = results["duplicates"]
if duplicates["summary"]["duplicate_groups_found"] > 0:
penalty = min(30, duplicates["summary"]["duplicate_groups_found"] * 3)
score -= penalty
# Code smells penalty (max -20 points)
smells = results["code_smells"]
if smells["total_smells"] > 0:
penalty = min(20, smells["total_smells"] * 2)
score -= penalty
# Maintainability bonus/penalty (max ±20 points)
avg_maintainability = complexity["summary"]["average_maintainability_index"]
if avg_maintainability > 70:
score += min(20.0, (avg_maintainability - 70) * 0.5)
elif avg_maintainability < 30:
score -= min(20.0, (30 - avg_maintainability) * 0.5)
return max(0.0, score)
@cli.command()
@click.option(
    "--output-path",
    "-o",
    default=".quality-exceptions.yaml",
    type=click.Path(path_type=Path),
    help="Output path for exceptions configuration file",
)
def create_exceptions_template(output_path: Path) -> None:
    """Create a template exceptions configuration file."""
    template_content = create_exceptions_config_template()
    # Never clobber an existing file without explicit confirmation.
    overwrite_declined = output_path.exists() and not click.confirm(
        f"File {output_path} already exists. Overwrite?"
    )
    if overwrite_declined:
        click.echo("Aborted.")
        return
    output_path.write_text(template_content, encoding="utf-8")
    click.echo(f"✅ Created exceptions configuration template at: {output_path}")
    click.echo("📝 Edit this file to configure exception rules for your project")
    click.echo(f"🔧 Use with: --exceptions-file {output_path}")
# Allow invoking the CLI directly via `python main.py`.
if __name__ == "__main__":
    cli()

View File

@@ -0,0 +1,13 @@
"""Code complexity analysis module."""
from .analyzer import ComplexityAnalyzer
from .calculator import ComplexityCalculator
from .metrics import ComplexityMetrics
from .radon_integration import RadonComplexityAnalyzer
__all__ = [
"ComplexityAnalyzer",
"ComplexityCalculator",
"ComplexityMetrics",
"RadonComplexityAnalyzer",
]

View File

@@ -0,0 +1,311 @@
"""High-level complexity analysis interface."""
from pathlib import Path
from typing import Any
from .metrics import ComplexityMetrics
from .radon_integration import RadonComplexityAnalyzer
from ..config.schemas import ComplexityConfig
# Optional import: ExceptionFilter lives in a module that can create a
# circular import at load time, so fall back to None here; the class below
# re-imports it lazily when it is actually needed.
# NOTE(review): despite the original comment, this is a runtime fallback,
# not a typing.TYPE_CHECKING guard — confirm whether a TYPE_CHECKING-only
# import was intended.
try:
    from ..core.exceptions import ExceptionFilter
except ImportError:
    ExceptionFilter = None
class ComplexityAnalyzer:
    """High-level interface for code complexity analysis.

    Delegates metric extraction to :class:`RadonComplexityAnalyzer` and
    filters/aggregates the results according to a :class:`ComplexityConfig`.
    """

    def __init__(self, config: ComplexityConfig | None = None, full_config: Any = None):
        """Initialize the analyzer.

        Args:
            config: Complexity-specific settings; defaults when None.
            full_config: Optional full quality configuration; when given, an
                ExceptionFilter is created so findings can be suppressed.
        """
        self.config = config or ComplexityConfig()
        self.radon_analyzer = RadonComplexityAnalyzer(fallback_to_manual=True)
        # Declare the attribute exactly once: annotating it in both branches
        # (as before) is reported as a redefinition by type checkers.
        self.exception_filter: ExceptionFilter | None = None
        if full_config:
            # Imported lazily to avoid a circular import at module load time.
            from ..core.exceptions import ExceptionFilter

            self.exception_filter = ExceptionFilter(full_config)

    def analyze_code(self, code: str, filename: str = "<string>") -> ComplexityMetrics:
        """Analyze complexity of a code string."""
        metrics = self.radon_analyzer.analyze_code(code, filename)
        return self._filter_metrics_by_config(metrics)

    def analyze_file(self, file_path: Path) -> ComplexityMetrics:
        """Analyze complexity of a file."""
        metrics = self.radon_analyzer.analyze_file(file_path)
        return self._filter_metrics_by_config(metrics)

    def batch_analyze_files(
        self, file_paths: list[Path], max_workers: int | None = None
    ) -> dict[Path, ComplexityMetrics]:
        """Analyze multiple files in parallel, filtering each result."""
        raw_results = self.radon_analyzer.batch_analyze_files(file_paths, max_workers)
        # Filter metrics based on configuration.
        return {
            path: self._filter_metrics_by_config(metrics)
            for path, metrics in raw_results.items()
        }

    def get_complexity_summary(self, metrics: ComplexityMetrics) -> dict[str, Any]:
        """Get a human-readable summary of complexity metrics."""
        return {
            "overall_score": metrics.get_overall_score(),
            "complexity_level": metrics.get_complexity_level(),
            "priority_score": metrics.get_priority_score(),
            "recommendations": metrics.get_recommendations(),
            "key_metrics": {
                "cyclomatic_complexity": metrics.cyclomatic_complexity,
                "cognitive_complexity": metrics.cognitive_complexity,
                "maintainability_index": metrics.maintainability_index,
                "max_nesting_depth": metrics.max_nesting_depth,
                "lines_of_code": metrics.lines_of_code,
                "function_count": metrics.function_count,
                "class_count": metrics.class_count,
            },
            "flags": self._get_complexity_flags(metrics),
        }

    def get_detailed_report(
        self, code: str, filename: str = "<string>"
    ) -> dict[str, Any]:
        """Get detailed complexity report including function-level analysis."""
        report = self.radon_analyzer.get_detailed_complexity_report(code, filename)
        # Add summary information derived from the file-level metrics.
        if "file_metrics" in report:
            metrics = ComplexityMetrics.from_dict(report["file_metrics"])
            report["summary"] = self.get_complexity_summary(metrics)
        # Surface functions that exceed the configured threshold.
        if "functions" in report:
            report["high_complexity_functions"] = [
                func
                for func in report["functions"]
                if func["complexity"] >= self.config.complexity_threshold
            ]
        return report

    def find_complex_code(
        self, file_paths: list[Path], max_workers: int | None = None
    ) -> list[dict[str, Any]]:
        """Find files exceeding complexity thresholds, sorted by priority."""
        results = self.batch_analyze_files(file_paths, max_workers)
        complex_files = []
        for path, metrics in results.items():
            if not self._is_complex(metrics):
                continue
            # Honor configured suppression rules, if any.  The returned
            # reason is not used here.
            if self.exception_filter:
                should_suppress, _reason = self.exception_filter.should_suppress_issue(
                    "complexity", "high_complexity", str(path), 1, ""
                )
                if should_suppress:
                    continue
            summary = self.get_complexity_summary(metrics)
            complex_files.append(
                {
                    "file_path": str(path),
                    "metrics": metrics.to_dict(),
                    "summary": summary,
                    "priority": summary["priority_score"],
                }
            )
        # Sort by priority (highest first).
        complex_files.sort(key=lambda x: x["priority"], reverse=True)
        return complex_files

    def get_project_complexity_overview(
        self, file_paths: list[Path], max_workers: int | None = None
    ) -> dict[str, Any]:
        """Get overall project complexity statistics."""
        results = self.batch_analyze_files(file_paths, max_workers)
        if not results:
            return {
                "total_files": 0,
                "summary": {},
                "distribution": {},
                "recommendations": [],
            }
        # Aggregate statistics; results is guaranteed non-empty past this
        # point, so the former `if total_files > 0` guards were dead code.
        total_files = len(results)
        total_lines = sum(m.lines_of_code for m in results.values())
        total_functions = sum(m.function_count for m in results.values())
        total_classes = sum(m.class_count for m in results.values())
        # Compute each overall score once and reuse it everywhere below.
        overall_scores = {path: m.get_overall_score() for path, m in results.items()}
        # Complexity distribution
        complexity_levels = {
            "Low": 0,
            "Moderate": 0,
            "High": 0,
            "Very High": 0,
            "Extreme": 0,
        }
        high_complexity_files = []
        for path, metrics in results.items():
            level = metrics.get_complexity_level()
            complexity_levels[level] += 1
            if overall_scores[path] >= 50:  # High complexity threshold
                high_complexity_files.append(
                    {
                        "file": str(path),
                        "score": overall_scores[path],
                        "level": level,
                    }
                )
        # Sort high complexity files by score (highest first)
        high_complexity_files.sort(key=lambda x: x["score"], reverse=True)
        # Project-level recommendations
        recommendations = []
        if complexity_levels["Extreme"] > 0:
            recommendations.append(
                f"🚨 {complexity_levels['Extreme']} files with extreme complexity need immediate attention"
            )
        if complexity_levels["Very High"] > 0:
            recommendations.append(
                f"⚠️ {complexity_levels['Very High']} files with very high complexity should be refactored"
            )
        avg_complexity = sum(overall_scores.values()) / total_files
        if avg_complexity > 40:
            recommendations.append(
                "📈 Overall project complexity is high - consider architectural improvements"
            )
        return {
            "total_files": total_files,
            "total_lines_of_code": total_lines,
            "total_functions": total_functions,
            "total_classes": total_classes,
            "summary": {
                "average_complexity_score": round(avg_complexity, 2),
                "average_cyclomatic_complexity": round(
                    sum(m.cyclomatic_complexity for m in results.values())
                    / total_files,
                    2,
                ),
                "average_maintainability_index": round(
                    sum(m.maintainability_index for m in results.values())
                    / total_files,
                    2,
                ),
            },
            "distribution": complexity_levels,
            "high_complexity_files": high_complexity_files[:10],  # Top 10
            "recommendations": recommendations,
            "config": {
                "complexity_threshold": self.config.complexity_threshold,
                "radon_available": self.radon_analyzer.is_available(),
                "metrics_included": {
                    "cyclomatic_complexity": self.config.include_cyclomatic,
                    "cognitive_complexity": self.config.include_cognitive,
                    "halstead_metrics": self.config.include_halstead,
                    "maintainability_index": self.config.include_maintainability,
                },
            },
        }

    def _filter_metrics_by_config(
        self, metrics: ComplexityMetrics
    ) -> ComplexityMetrics:
        """Return a copy of *metrics* restricted to configured metric groups."""
        filtered = ComplexityMetrics()
        # Always include basic size/structure metrics.
        filtered.lines_of_code = metrics.lines_of_code
        filtered.source_lines_of_code = metrics.source_lines_of_code
        filtered.logical_lines_of_code = metrics.logical_lines_of_code
        filtered.comment_lines = metrics.comment_lines
        filtered.blank_lines = metrics.blank_lines
        filtered.function_count = metrics.function_count
        filtered.class_count = metrics.class_count
        filtered.method_count = metrics.method_count
        # Optional metric groups, gated by configuration flags.
        if self.config.include_cyclomatic:
            filtered.cyclomatic_complexity = metrics.cyclomatic_complexity
        if self.config.include_cognitive:
            filtered.cognitive_complexity = metrics.cognitive_complexity
            filtered.max_nesting_depth = metrics.max_nesting_depth
            filtered.average_nesting_depth = metrics.average_nesting_depth
        if self.config.include_halstead:
            filtered.halstead_difficulty = metrics.halstead_difficulty
            filtered.halstead_effort = metrics.halstead_effort
            filtered.halstead_volume = metrics.halstead_volume
            filtered.halstead_time = metrics.halstead_time
            filtered.halstead_bugs = metrics.halstead_bugs
        if self.config.include_maintainability:
            filtered.maintainability_index = metrics.maintainability_index
        # Additional metrics that are always carried over.
        filtered.parameters_count = metrics.parameters_count
        filtered.variables_count = metrics.variables_count
        filtered.returns_count = metrics.returns_count
        return filtered

    def _is_complex(self, metrics: ComplexityMetrics) -> bool:
        """Check whether *metrics* exceeds any configured complexity threshold."""
        return (
            metrics.cyclomatic_complexity >= self.config.complexity_threshold
            or metrics.cognitive_complexity >= self.config.complexity_threshold * 1.5
            or metrics.max_nesting_depth > 4
            or metrics.maintainability_index < 20
        )

    def _get_complexity_flags(self, metrics: ComplexityMetrics) -> list[str]:
        """Get the list of complexity warning flags raised by *metrics*."""
        flags = []
        if metrics.cyclomatic_complexity > self.config.complexity_threshold:
            flags.append("HIGH_CYCLOMATIC_COMPLEXITY")
        if metrics.cognitive_complexity > self.config.complexity_threshold * 1.5:
            flags.append("HIGH_COGNITIVE_COMPLEXITY")
        if metrics.max_nesting_depth > 4:
            flags.append("DEEP_NESTING")
        if metrics.maintainability_index < 20:
            flags.append("LOW_MAINTAINABILITY")
        if metrics.halstead_difficulty > 20:
            flags.append("HIGH_HALSTEAD_DIFFICULTY")
        if metrics.function_count == 0 and metrics.lines_of_code > 50:
            flags.append("LARGE_MONOLITHIC_CODE")
        if metrics.parameters_count > 5:
            flags.append("TOO_MANY_PARAMETERS")
        return flags

View File

@@ -0,0 +1,358 @@
"""Manual complexity calculation algorithms."""
import ast
import re
from collections import Counter
from .metrics import ComplexityMetrics
class ComplexityCalculator:
"""Manual complexity calculator using AST analysis."""
def calculate_complexity(self, code: str) -> ComplexityMetrics:
    """Calculate all complexity metrics for the given source *code*.

    Falls back to text-only line metrics when the source cannot be parsed.
    """
    try:
        syntax_tree = ast.parse(code)
    except SyntaxError:
        # Malformed code: only line-based metrics can be computed.
        return self._analyze_text_metrics(code)
    return self._analyze_ast(syntax_tree, code)
def _analyze_ast(self, tree: ast.AST, code: str) -> ComplexityMetrics:
"""Analyze AST to extract complexity metrics."""
metrics = ComplexityMetrics()
# Basic line counts
lines = code.split("\n")
metrics.lines_of_code = len(lines)
metrics.blank_lines = len([line for line in lines if not line.strip()])
metrics.comment_lines = len(
[line for line in lines if line.strip().startswith("#")]
)
metrics.source_lines_of_code = (
metrics.lines_of_code - metrics.blank_lines - metrics.comment_lines
)
# AST-based metrics
for node in ast.walk(tree):
if isinstance(node, ast.FunctionDef):
metrics.function_count += 1
# Count parameters
metrics.parameters_count += len(node.args.args)
# Count returns
metrics.returns_count += len(
[n for n in ast.walk(node) if isinstance(n, ast.Return)]
)
elif isinstance(node, ast.ClassDef):
metrics.class_count += 1
elif isinstance(node, ast.AsyncFunctionDef):
metrics.function_count += 1
metrics.parameters_count += len(node.args.args)
metrics.returns_count += len(
[n for n in ast.walk(node) if isinstance(n, ast.Return)]
)
# Calculate cyclomatic complexity
metrics.cyclomatic_complexity = self._calculate_cyclomatic_complexity(tree)
# Calculate cognitive complexity
metrics.cognitive_complexity = self._calculate_cognitive_complexity(tree)
# Calculate nesting metrics
metrics.max_nesting_depth, metrics.average_nesting_depth = (
self._calculate_nesting_metrics(tree)
)
# Calculate Halstead metrics
halstead = self._calculate_halstead_metrics(tree)
metrics.halstead_difficulty = halstead.get("difficulty", 0.0)
metrics.halstead_volume = halstead.get("volume", 0.0)
metrics.halstead_effort = halstead.get("effort", 0.0)
metrics.halstead_time = halstead.get("time", 0.0)
metrics.halstead_bugs = halstead.get("bugs", 0.0)
# Calculate maintainability index
metrics.maintainability_index = self._calculate_maintainability_index(metrics)
# Logical lines of code (non-empty, non-comment)
metrics.logical_lines_of_code = self._count_logical_lines(tree)
# Count variables
metrics.variables_count = self._count_variables(tree)
# Count methods in classes
metrics.method_count = self._count_methods(tree)
return metrics
def _analyze_text_metrics(self, code: str) -> ComplexityMetrics:
"""Fallback text-based analysis for malformed code."""
metrics = ComplexityMetrics()
lines = code.split("\n")
metrics.lines_of_code = len(lines)
metrics.blank_lines = len([line for line in lines if not line.strip()])
metrics.comment_lines = len(
[line for line in lines if line.strip().startswith("#")]
)
metrics.source_lines_of_code = (
metrics.lines_of_code - metrics.blank_lines - metrics.comment_lines
)
# Basic pattern matching
metrics.function_count = len(re.findall(r"^\s*def\s+\w+", code, re.MULTILINE))
metrics.class_count = len(re.findall(r"^\s*class\s+\w+", code, re.MULTILINE))
return metrics
def _calculate_cyclomatic_complexity(self, tree: ast.AST) -> int:
"""Calculate McCabe cyclomatic complexity."""
complexity = 1 # Base complexity
for node in ast.walk(tree):
if isinstance(
node,
(
ast.If,
ast.While,
ast.For,
ast.AsyncFor,
ast.ExceptHandler,
ast.With,
ast.Assert,
),
):
complexity += 1
elif isinstance(node, ast.BoolOp):
# Add complexity for boolean operations (and, or)
complexity += len(node.values) - 1
elif isinstance(node, ast.Expr) and isinstance(node.value, ast.IfExp):
# Ternary operator
complexity += 1
return complexity
def _calculate_cognitive_complexity(self, tree: ast.AST) -> int:
"""Calculate cognitive complexity (similar to SonarQube)."""
complexity = 0
def visit_node(node: ast.AST, depth: int = 0) -> int:
nonlocal complexity
local_complexity = 0
if isinstance(
node,
(
ast.If,
ast.While,
ast.For,
ast.AsyncFor,
ast.ExceptHandler,
ast.With,
),
):
local_complexity += 1 + depth
elif isinstance(node, ast.BoolOp):
# Logical operators add complexity
local_complexity += len(node.values) - 1
elif (
isinstance(node, ast.Lambda)
or isinstance(node, ast.Expr)
and isinstance(node.value, ast.IfExp)
):
local_complexity += 1
complexity += local_complexity
# Increase nesting for control structures
new_depth = (
depth + 1
if isinstance(
node,
(
ast.If,
ast.While,
ast.For,
ast.AsyncFor,
ast.ExceptHandler,
ast.With,
),
)
else depth
)
# Recursively visit children
for child in ast.iter_child_nodes(node):
visit_node(child, new_depth)
return complexity
visit_node(tree)
return complexity
def _calculate_nesting_metrics(self, tree: ast.AST) -> tuple[int, float]:
"""Calculate nesting depth metrics."""
depths = []
def visit_node(node: ast.AST, depth: int = 0) -> None:
current_depth = depth
if isinstance(
node, (ast.If, ast.While, ast.For, ast.AsyncFor, ast.With, ast.Try)
):
current_depth += 1
depths.append(current_depth)
for child in ast.iter_child_nodes(node):
visit_node(child, current_depth)
visit_node(tree)
max_depth = max(depths) if depths else 0
avg_depth = sum(depths) / len(depths) if depths else 0.0
return max_depth, round(avg_depth, 2)
def _calculate_halstead_metrics(self, tree: ast.AST) -> dict[str, float]:
"""Calculate Halstead complexity metrics."""
operators = Counter()
operands = Counter()
for node in ast.walk(tree):
# Operators
if isinstance(node, (ast.BinOp, ast.UnaryOp)):
operators[type(node.op).__name__] += 1
elif isinstance(node, ast.Compare):
for op in node.ops:
operators[type(op).__name__] += 1
elif isinstance(node, ast.BoolOp):
operators[type(node.op).__name__] += 1
elif isinstance(node, (ast.If, ast.While, ast.For, ast.AsyncFor)):
operators["control"] += 1
elif isinstance(node, ast.Call):
operators["call"] += 1
elif isinstance(node, (ast.Assign, ast.AugAssign)):
operators["assign"] += 1
# Operands
if isinstance(node, ast.Name):
operands[node.id] += 1
elif isinstance(node, ast.Constant):
operands[str(node.value)] += 1
elif isinstance(node, ast.Attribute):
operands[node.attr] += 1
# Halstead metrics
n1 = len(operators) # Number of unique operators
n2 = len(operands) # Number of unique operands
N1 = sum(operators.values()) # Total operators
N2 = sum(operands.values()) # Total operands
vocabulary = n1 + n2
length = N1 + N2
if n2 == 0:
return {
"difficulty": 0.0,
"volume": 0.0,
"effort": 0.0,
"time": 0.0,
"bugs": 0.0,
}
# Prevent division by zero and invalid log
if vocabulary <= 1:
volume = 0.0
else:
import math
volume = length * math.log2(vocabulary)
difficulty = (n1 / 2) * (N2 / n2) if n2 > 0 else 0.0
effort = difficulty * volume
time = effort / 18 # Seconds
bugs = volume / 3000 # Delivered bugs estimation
return {
"difficulty": round(difficulty, 2),
"volume": round(volume, 2),
"effort": round(effort, 2),
"time": round(time, 2),
"bugs": round(bugs, 4),
}
def _calculate_maintainability_index(self, metrics: ComplexityMetrics) -> float:
"""Calculate maintainability index."""
import math
# Original Microsoft formula adapted
# MI = 171 - 5.2 * ln(HV) - 0.23 * CC - 16.2 * ln(LOC)
# Where HV = Halstead Volume, CC = Cyclomatic Complexity, LOC = Lines of Code
if metrics.halstead_volume <= 0 or metrics.source_lines_of_code <= 0:
return 100.0 # Default high maintainability for simple code
try:
mi: float = (
171
- 5.2 * math.log(metrics.halstead_volume)
- 0.23 * metrics.cyclomatic_complexity
- 16.2 * math.log(metrics.source_lines_of_code)
)
# Normalize to 0-100 scale
mi = max(0.0, min(100.0, mi))
return round(mi, 2)
except (ValueError, ZeroDivisionError):
return 50.0 # Default moderate maintainability
def _count_logical_lines(self, tree: ast.AST) -> int:
"""Count logical lines of code (AST nodes that represent statements)."""
count = 0
for node in ast.walk(tree):
if isinstance(
node,
ast.Assign
| ast.AugAssign
| ast.Return
| ast.Yield
| ast.YieldFrom
| ast.Expr
| ast.Import
| ast.ImportFrom
| ast.Pass
| ast.Break
| ast.Continue
| ast.Global
| ast.Nonlocal
| ast.Assert,
):
count += 1
return count
def _count_variables(self, tree: ast.AST) -> int:
"""Count unique variable names."""
variables = set()
for node in ast.walk(tree):
if isinstance(node, ast.Name) and isinstance(
node.ctx, (ast.Store, ast.Del)
):
variables.add(node.id)
return len(variables)
def _count_methods(self, tree: ast.AST) -> int:
"""Count methods inside classes."""
method_count = 0
for node in ast.walk(tree):
if isinstance(node, ast.ClassDef):
for child in node.body:
if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef)):
method_count += 1
return method_count

View File

@@ -0,0 +1,186 @@
"""Complexity metrics data structures and calculations."""
from dataclasses import dataclass
from typing import Any
@dataclass
class ComplexityMetrics:
    """Container for various complexity metrics.

    Plain data holder produced by the complexity analyzers. All fields
    default to zero, so a freshly constructed instance represents
    "no complexity measured yet".
    """

    # Decision-point counts.
    cyclomatic_complexity: int = 0
    cognitive_complexity: int = 0
    # Halstead metrics.
    halstead_difficulty: float = 0.0
    halstead_effort: float = 0.0
    halstead_volume: float = 0.0
    halstead_time: float = 0.0
    halstead_bugs: float = 0.0
    # Maintainability index (0-100, higher is better).
    maintainability_index: float = 0.0
    # Raw line counts.
    lines_of_code: int = 0
    source_lines_of_code: int = 0
    logical_lines_of_code: int = 0
    comment_lines: int = 0
    blank_lines: int = 0
    # Definition counts.
    function_count: int = 0
    class_count: int = 0
    method_count: int = 0
    # Nesting and depth metrics.
    max_nesting_depth: int = 0
    average_nesting_depth: float = 0.0
    # Additional metrics.
    parameters_count: int = 0
    variables_count: int = 0
    returns_count: int = 0

    def to_dict(self) -> dict[str, Any]:
        """Convert to a dictionary representation (field name -> value)."""
        from dataclasses import asdict

        # asdict preserves field declaration order, matching the previous
        # hand-written mapping while never drifting out of sync with the
        # declared fields.
        return asdict(self)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "ComplexityMetrics":
        """Create an instance from a dictionary representation.

        Unknown keys are ignored so dictionaries produced by newer or older
        versions of the toolkit still load instead of raising TypeError.
        """
        from dataclasses import fields

        known = {f.name for f in fields(cls)}
        return cls(**{k: v for k, v in data.items() if k in known})

    def get_overall_score(self) -> float:
        """Calculate overall complexity score (0-100, lower is better).

        Weighted blend: cyclomatic 30%, cognitive 30%, inverted
        maintainability 20%, nesting depth 10%, Halstead difficulty 10%.
        Each component is capped at 100 before weighting.
        """
        score = 0.0
        score += min(self.cyclomatic_complexity * 2, 100) * 0.3
        score += min(self.cognitive_complexity * 2, 100) * 0.3
        # Maintainability is inverted: a higher MI means lower complexity.
        score += max(100 - self.maintainability_index, 0) * 0.2
        score += min(self.max_nesting_depth * 10, 100) * 0.1
        score += min(self.halstead_difficulty * 3, 100) * 0.1
        return round(score, 2)

    def get_complexity_level(self) -> str:
        """Get the human-readable level for the overall score."""
        score = self.get_overall_score()
        if score < 20:
            return "Low"
        if score < 40:
            return "Moderate"
        if score < 60:
            return "High"
        if score < 80:
            return "Very High"
        return "Extreme"

    def get_priority_score(self) -> float:
        """Get refactoring priority (0-1, higher means more urgent).

        Derived from the overall score with extra boosts for extreme
        cyclomatic/cognitive complexity and very deep nesting, capped at 1.
        """
        priority = self.get_overall_score() / 100.0
        if self.cyclomatic_complexity > 20:
            priority = min(priority + 0.2, 1.0)
        if self.cognitive_complexity > 25:
            priority = min(priority + 0.2, 1.0)
        if self.max_nesting_depth > 5:
            priority = min(priority + 0.1, 1.0)
        return round(priority, 3)

    def get_recommendations(self) -> list[str]:
        """Get one textual recommendation per exceeded threshold."""
        recommendations = []
        if self.cyclomatic_complexity > 10:
            recommendations.append(
                f"High cyclomatic complexity ({self.cyclomatic_complexity}). "
                "Consider breaking down complex conditional logic."
            )
        if self.cognitive_complexity > 15:
            recommendations.append(
                f"High cognitive complexity ({self.cognitive_complexity}). "
                "Consider extracting nested logic into separate methods."
            )
        if self.max_nesting_depth > 4:
            recommendations.append(
                f"Deep nesting detected ({self.max_nesting_depth} levels). "
                "Consider using guard clauses or early returns."
            )
        if self.maintainability_index < 20:
            recommendations.append(
                f"Low maintainability index ({self.maintainability_index:.1f}). "
                "Consider refactoring for better readability and simplicity."
            )
        if self.halstead_difficulty > 20:
            recommendations.append(
                f"High Halstead difficulty ({self.halstead_difficulty:.1f}). "
                "Code may be hard to understand and maintain."
            )
        if self.function_count == 0 and self.lines_of_code > 50:
            recommendations.append(
                "Large code block without functions. "
                "Consider extracting reusable functions."
            )
        if self.parameters_count > 5:
            recommendations.append(
                f"Many parameters ({self.parameters_count}). "
                "Consider using parameter objects or configuration classes."
            )
        return recommendations

View File

@@ -0,0 +1,348 @@
"""Radon integration for professional complexity analysis."""
import ast
from pathlib import Path
from typing import Any
try:
from radon.complexity import cc_rank, cc_visit
from radon.metrics import h_visit, mi_visit
from radon.raw import analyze
RADON_AVAILABLE = True
except ImportError:
RADON_AVAILABLE = False
from .calculator import ComplexityCalculator
from .metrics import ComplexityMetrics
class RadonComplexityAnalyzer:
    """Professional complexity analyzer using the Radon library.

    Wraps Radon's raw/cyclomatic/Halstead/MI visitors and augments the
    result with metrics Radon does not provide (cognitive complexity,
    nesting depth, parameter/variable/return counts). When Radon is not
    installed, the analyzer can fall back to the pure-Python
    ComplexityCalculator.
    """

    def __init__(self, fallback_to_manual: bool = True):
        # When True, a missing or failing Radon degrades to the manual
        # AST-based calculator instead of raising.
        self.fallback_to_manual = fallback_to_manual
        self.manual_calculator = ComplexityCalculator()

    def analyze_code(self, code: str, filename: str = "<string>") -> ComplexityMetrics:
        """Analyze code complexity with Radon or the manual fallback.

        Raises ImportError when Radon is unavailable and fallback disabled.
        """
        if RADON_AVAILABLE:
            return self._analyze_with_radon(code, filename)
        if self.fallback_to_manual:
            return self.manual_calculator.calculate_complexity(code)
        raise ImportError("Radon is not available and fallback is disabled")

    def analyze_file(self, file_path: Path) -> ComplexityMetrics:
        """Analyze one file; unreadable files yield empty metrics."""
        try:
            with open(file_path, encoding="utf-8") as f:
                code = f.read()
            return self.analyze_code(code, str(file_path))
        except Exception:
            # Unreadable/undecodable files must not abort a batch run.
            return ComplexityMetrics()

    def _analyze_with_radon(self, code: str, filename: str) -> ComplexityMetrics:
        """Analyze code using the Radon library."""
        metrics = ComplexityMetrics()
        try:
            # Raw metrics (line counts).
            raw_metrics = analyze(code)
            if raw_metrics:
                metrics.lines_of_code = raw_metrics.loc
                metrics.logical_lines_of_code = raw_metrics.lloc
                metrics.source_lines_of_code = raw_metrics.sloc
                metrics.comment_lines = raw_metrics.comments
                metrics.blank_lines = raw_metrics.blank
            # Cyclomatic complexity, summed over all functions/methods.
            cc_results = cc_visit(code)
            if cc_results:
                metrics.cyclomatic_complexity = sum(
                    block.complexity for block in cc_results
                )
                metrics.function_count = len(
                    [b for b in cc_results if b.is_method or b.type == "function"]
                )
                metrics.class_count = len([b for b in cc_results if b.type == "class"])
                metrics.method_count = len([b for b in cc_results if b.is_method])
            # Halstead metrics.
            try:
                halstead_data = h_visit(code)
                # BUG FIX: radon >= 4 returns a report whose file-level
                # numbers live on `.total`; older versions exposed them on
                # the result itself. Support both instead of always hitting
                # AttributeError and silently leaving the metrics at zero.
                report = getattr(halstead_data, "total", halstead_data)
                if report is not None:
                    metrics.halstead_difficulty = report.difficulty
                    metrics.halstead_effort = report.effort
                    metrics.halstead_volume = report.volume
                    metrics.halstead_time = report.time
                    metrics.halstead_bugs = report.bugs
            except Exception:
                # Halstead calculation can fail for some code patterns.
                pass
            # Maintainability index.
            try:
                mi_value = mi_visit(code, multi=True)
                # BUG FIX: mi_visit returns a plain float, not an object
                # with an `.mi` attribute, so the old hasattr check never
                # matched and the MI was always left at 0.
                if isinstance(mi_value, (int, float)):
                    metrics.maintainability_index = float(mi_value)
                else:
                    metrics.maintainability_index = self._calculate_mi_fallback(
                        metrics
                    )
            except Exception:
                # MI calculation can fail; compute it manually instead.
                metrics.maintainability_index = self._calculate_mi_fallback(metrics)
            # Add the metrics Radon does not provide.
            metrics = self._enhance_with_manual_metrics(code, metrics)
        except Exception:
            # If Radon fails completely, fall back to manual calculation.
            if self.fallback_to_manual:
                return self.manual_calculator.calculate_complexity(code)
            raise
        return metrics

    def _enhance_with_manual_metrics(
        self, code: str, metrics: ComplexityMetrics
    ) -> ComplexityMetrics:
        """Add metrics not provided by Radon using manual AST analysis."""
        try:
            tree = ast.parse(code)
        except SyntaxError:
            # Radon already produced what it could; keep those values.
            return metrics
        metrics.cognitive_complexity = self._calculate_cognitive_complexity(tree)
        metrics.max_nesting_depth, metrics.average_nesting_depth = (
            self._calculate_nesting_metrics(tree)
        )
        # Positional parameters / return statements per function, plus the
        # set of names that are ever assigned or deleted.
        assigned_names: set = set()
        for node in ast.walk(tree):
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                metrics.parameters_count += len(node.args.args)
                metrics.returns_count += sum(
                    1 for n in ast.walk(node) if isinstance(n, ast.Return)
                )
            elif isinstance(node, ast.Name) and isinstance(
                node.ctx, (ast.Store, ast.Del)
            ):
                assigned_names.add(node.id)
        metrics.variables_count = len(assigned_names)
        return metrics

    def _calculate_cognitive_complexity(self, tree: ast.AST) -> int:
        """Calculate cognitive complexity manually (SonarQube-style)."""
        nesting_nodes = (
            ast.If,
            ast.While,
            ast.For,
            ast.AsyncFor,
            ast.ExceptHandler,
            ast.With,
        )
        complexity = 0

        def visit_node(node: ast.AST, depth: int = 0) -> None:
            nonlocal complexity
            if isinstance(node, nesting_nodes):
                # Nested control flow costs more the deeper it sits.
                complexity += 1 + depth
            elif isinstance(node, ast.BoolOp):
                complexity += len(node.values) - 1
            elif isinstance(node, (ast.Lambda, ast.IfExp)):
                # BUG FIX: count ternaries wherever they appear, not only
                # as bare expression statements.
                complexity += 1
            child_depth = depth + 1 if isinstance(node, nesting_nodes) else depth
            for child in ast.iter_child_nodes(node):
                visit_node(child, child_depth)

        visit_node(tree)
        return complexity

    def _calculate_nesting_metrics(self, tree: ast.AST) -> tuple[int, float]:
        """Return (max depth, average depth) over all AST nodes."""
        nesting_nodes = (ast.If, ast.While, ast.For, ast.AsyncFor, ast.With, ast.Try)
        depths: list = []

        def visit_node(node: ast.AST, depth: int = 0) -> None:
            current_depth = depth + 1 if isinstance(node, nesting_nodes) else depth
            depths.append(current_depth)
            for child in ast.iter_child_nodes(node):
                visit_node(child, current_depth)

        visit_node(tree)
        if not depths:
            return 0, 0.0
        return max(depths), round(sum(depths) / len(depths), 2)

    def _calculate_mi_fallback(self, metrics: ComplexityMetrics) -> float:
        """Calculate the maintainability index when Radon cannot.

        Same clamped Microsoft formula used by the manual calculator.
        """
        import math

        if metrics.halstead_volume <= 0 or metrics.source_lines_of_code <= 0:
            return 100.0  # Trivial code defaults to fully maintainable.
        try:
            mi = (
                171
                - 5.2 * math.log(metrics.halstead_volume)
                - 0.23 * metrics.cyclomatic_complexity
                - 16.2 * math.log(metrics.source_lines_of_code)
            )
        except (ValueError, ZeroDivisionError):
            return 50.0
        # Clamp to [0, 100] and keep the result a float.
        return float(max(0.0, min(100.0, round(mi, 2))))

    def get_complexity_rank(self, complexity_score: int) -> str:
        """Map a cyclomatic score to a letter rank (A best .. F worst)."""
        if RADON_AVAILABLE:
            return cc_rank(complexity_score)
        # Manual buckets approximating Radon's A-F ranking.
        if complexity_score <= 5:
            return "A"  # Low
        if complexity_score <= 10:
            return "B"  # Moderate
        if complexity_score <= 20:
            return "C"  # High
        if complexity_score <= 30:
            return "D"  # Very High
        return "F"  # Extreme

    def batch_analyze_files(
        self, file_paths: list[Path], max_workers: int | None = None
    ) -> dict[Path, ComplexityMetrics]:
        """Analyze multiple files in parallel threads.

        Files that fail map to empty ComplexityMetrics instead of raising.
        """
        import concurrent.futures
        import os

        if max_workers is None:
            max_workers = os.cpu_count() or 4
        results: dict[Path, ComplexityMetrics] = {}
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_path = {
                executor.submit(self.analyze_file, path): path for path in file_paths
            }
            for future in concurrent.futures.as_completed(future_to_path):
                path = future_to_path[future]
                try:
                    results[path] = future.result()
                except Exception:
                    # One bad file must not sink the whole batch.
                    results[path] = ComplexityMetrics()
        return results

    def get_detailed_complexity_report(
        self, code: str, filename: str = "<string>"
    ) -> dict[str, Any]:
        """Get file metrics plus per-function/per-class complexity details."""
        if not RADON_AVAILABLE:
            metrics = self.manual_calculator.calculate_complexity(code)
            return {
                "file_metrics": metrics.to_dict(),
                "functions": [],
                "classes": [],
                "radon_available": False,
            }
        metrics = self._analyze_with_radon(code, filename)
        functions: list[dict[str, Any]] = []
        classes: list[dict[str, Any]] = []
        try:
            for block in cc_visit(code):
                item = {
                    "name": block.name,
                    "complexity": block.complexity,
                    "rank": self.get_complexity_rank(block.complexity),
                    "line_number": block.lineno,
                    "end_line": getattr(block, "endline", None),
                    "type": block.type,
                    "is_method": getattr(block, "is_method", False),
                }
                if block.type == "function" or getattr(block, "is_method", False):
                    functions.append(item)
                elif block.type == "class":
                    classes.append(item)
        except Exception:
            # Block-level details are best effort; file metrics still apply.
            pass
        return {
            "file_metrics": metrics.to_dict(),
            "functions": functions,
            "classes": classes,
            "radon_available": True,
        }

    @staticmethod
    def is_available() -> bool:
        """Check whether Radon is importable in this environment."""
        return RADON_AVAILABLE

    @staticmethod
    def get_radon_version() -> str | None:
        """Get the installed Radon version, or None if unavailable."""
        if not RADON_AVAILABLE:
            return None
        import radon

        # getattr with a default never raises; no except clause needed.
        return getattr(radon, "__version__", "unknown")

View File

@@ -0,0 +1,5 @@
"""Configuration management for code quality analysis."""
from .schemas import QualityConfig, load_config
__all__ = ["QualityConfig", "load_config"]

View File

@@ -0,0 +1,128 @@
# Default configuration for IntelliKit Quality Analysis
# Detection settings
detection:
min_lines: 5 # Minimum lines for duplicate detection
min_tokens: 50 # Minimum tokens for duplicate detection
similarity_threshold: 0.8 # Similarity threshold (0.0-1.0)
# Similarity algorithm weights (should sum to ~1.0) - Optimized for better accuracy
similarity_algorithms:
- name: structural
weight: 0.5
enabled: true
- name: cosine
weight: 0.2
enabled: true
- name: jaccard
weight: 0.15
enabled: true
- name: levenshtein
weight: 0.1
enabled: true
- name: semantic
weight: 0.05
enabled: true
# Performance settings - Optimized LSH parameters
use_lsh: true # Use LSH for large codebases
lsh_threshold: 500 # Use LSH when blocks > this number (reduced for better coverage)
lsh_bands: 20 # Number of LSH bands (increased for better precision)
lsh_rows: 4 # Rows per band (decreased to balance precision/recall)
lsh_num_perm: 256 # Number of permutations (increased for better accuracy)
parallel_processing: true # Enable parallel processing
max_workers: null # Auto-detect CPU cores
# Complexity analysis
complexity:
include_cyclomatic: true # Include McCabe complexity
include_cognitive: true # Include cognitive complexity
include_halstead: true # Include Halstead metrics
include_maintainability: true # Include maintainability index
complexity_threshold: 10 # Threshold for flagging complex code
# Language support
languages:
languages:
- python
- javascript
- typescript
file_extensions:
python: [".py", ".pyx", ".pyi"]
javascript: [".js", ".jsx", ".es6", ".mjs"]
typescript: [".ts", ".tsx"]
# File path configuration
paths:
include_patterns:
- "**/*.py"
- "**/*.js"
- "**/*.ts"
exclude_patterns:
- "**/__pycache__/**"
- "**/*.pyc"
- "**/venv/**"
- "**/.venv/**"
- "**/node_modules/**"
- "**/.git/**"
- "**/build/**"
- "**/dist/**"
- "**/migrations/**"
max_files: null # No limit
follow_symlinks: false
# Refactoring suggestions
refactoring:
enabled: true
min_priority_score: 1.0 # Minimum priority for suggestions
suggest_extract_method: true
suggest_extract_class: true
suggest_parameter_object: true
suggest_template_method: true
estimate_effort: true # Include effort estimates
risk_threshold: 0.7 # Risk threshold for suggestions
# Reporting configuration
reporting:
formats: ["console"] # Output formats
output_dir: "./quality_reports"
# Console output settings
show_code_preview: true
show_complexity_metrics: true
show_refactoring_suggestions: true
# Dashboard settings
dashboard_enabled: false
dashboard_port: 8080
dashboard_host: "localhost"
# Export formats
export_sarif: false # IDE integration format
export_json: false
export_html: false
export_csv: false
# Cache configuration
cache:
enabled: true
cache_dir: ".quality_cache"
max_age_days: 7 # Cache expiry in days
use_memory_cache: true
# External integrations
integrations:
# Git integration
use_git: true
analyze_git_history: false # Analyze historical changes
blame_duplicates: false # Show git blame for duplicates
# JSCPD for multi-language support
use_jscpd: true
jscpd_path: null # Auto-detect jscpd path
jscpd_config: {} # Additional jscpd options
# Global settings
version: "1.0.0"
debug: false
verbose: false

View File

@@ -0,0 +1,294 @@
"""Configuration schemas using Pydantic."""
from pathlib import Path
import yaml
from pydantic import BaseModel, field_validator
try:
from pydantic import Field
except ImportError:
from pydantic.v1 import Field
class SimilarityAlgorithmConfig(BaseModel):
    """Configuration for one similarity algorithm used in duplicate detection."""

    name: str  # Algorithm identifier, e.g. "levenshtein", "jaccard", "cosine".
    weight: float = Field(default=1.0, ge=0.0, le=1.0)  # Relative weight in the blended score.
    enabled: bool = True  # Disabled algorithms contribute nothing.
    parameters: dict[str, str | int | float | bool] = Field(default_factory=dict)  # Algorithm-specific options.
class ComplexityConfig(BaseModel):
    """Toggles and threshold for the complexity analysis stage."""

    include_cyclomatic: bool = True  # McCabe cyclomatic complexity.
    include_cognitive: bool = True  # Cognitive complexity.
    include_halstead: bool = True  # Halstead difficulty/volume/effort metrics.
    include_maintainability: bool = True  # Maintainability index.
    complexity_threshold: int = Field(default=10, ge=1)  # Threshold for flagging complex code.
class DetectionConfig(BaseModel):
    """Configuration for duplicate detection.

    Controls the minimum size of candidate blocks, the similarity cutoff,
    the blend of similarity algorithms, and performance knobs (LSH,
    parallelism).
    """

    min_lines: int = Field(default=5, ge=1)  # Minimum lines for a candidate block.
    min_tokens: int = Field(default=50, ge=1)  # Minimum tokens for a candidate block.
    similarity_threshold: float = Field(default=0.8, ge=0.0, le=1.0)  # Match cutoff (0-1).
    # Similarity algorithms (weights are blended; see QualityConfig validator).
    similarity_algorithms: list[SimilarityAlgorithmConfig] = Field(
        default_factory=lambda: [
            SimilarityAlgorithmConfig(name="levenshtein", weight=0.2),
            SimilarityAlgorithmConfig(name="jaccard", weight=0.3),
            SimilarityAlgorithmConfig(name="cosine", weight=0.3),
            SimilarityAlgorithmConfig(name="semantic", weight=0.2),
        ]
    )
    # Performance settings
    use_lsh: bool = True  # Locality-sensitive hashing for large codebases.
    lsh_threshold: int = Field(
        default=1000, ge=100
    )  # Use LSH for datasets larger than this
    parallel_processing: bool = True
    max_workers: int | None = None  # None means auto-detect.
class LanguageConfig(BaseModel):
    """Configuration for language support.

    ``languages`` selects which of the known languages to analyze;
    ``file_extensions`` maps each language name to its file suffixes.
    """

    languages: set[str] = Field(default_factory=lambda: {"python"})  # Enabled languages.
    file_extensions: dict[str, list[str]] = Field(
        default_factory=lambda: {
            "python": [".py", ".pyx", ".pyi"],
            "javascript": [".js", ".jsx", ".es6", ".mjs"],
            "typescript": [".ts", ".tsx"],
            "java": [".java"],
            "c": [".c", ".h"],
            "cpp": [".cpp", ".cxx", ".cc", ".hpp", ".hxx"],
            "csharp": [".cs"],
            "go": [".go"],
            "rust": [".rs"],
            "php": [".php"],
            "ruby": [".rb"],
        }
    )
class PathConfig(BaseModel):
    """Configuration for file discovery (glob include/exclude patterns)."""

    include_patterns: list[str] = Field(default_factory=lambda: ["**/*.py"])  # Globs to scan.
    exclude_patterns: list[str] = Field(
        default_factory=lambda: [
            "**/__pycache__/**",
            "**/*.pyc",
            "**/venv/**",
            "**/.venv/**",
            "**/node_modules/**",
            "**/.git/**",
            "**/build/**",
            "**/dist/**",
        ]
    )
    max_files: int | None = None  # None means no limit.
    follow_symlinks: bool = False
class RefactoringConfig(BaseModel):
    """Configuration for refactoring suggestions.

    Individual ``suggest_*`` flags enable specific suggestion kinds.
    """

    enabled: bool = True
    min_priority_score: float = Field(default=1.0, ge=0.0)  # Minimum priority to report.
    suggest_extract_method: bool = True
    suggest_extract_class: bool = True
    suggest_parameter_object: bool = True
    suggest_template_method: bool = True
    estimate_effort: bool = True  # Include effort estimates in suggestions.
    risk_threshold: float = Field(default=0.7, ge=0.0, le=1.0)  # Risk cutoff for suggestions.
class ReportingConfig(BaseModel):
    """Configuration for reporting (console output, dashboard, exports)."""

    formats: list[str] = Field(default_factory=lambda: ["console"])  # Enabled output formats.
    output_dir: Path = Field(default=Path("./quality_reports"))  # Where report files are written.
    # Console reporting
    show_code_preview: bool = True
    show_complexity_metrics: bool = True
    show_refactoring_suggestions: bool = True
    # Dashboard settings
    dashboard_enabled: bool = False
    dashboard_port: int = Field(default=8080, ge=1024, le=65535)  # Non-privileged ports only.
    dashboard_host: str = "localhost"
    # Export formats
    export_sarif: bool = False  # SARIF for IDE/CI integration.
    export_json: bool = False
    export_html: bool = False
    export_csv: bool = False
class CacheConfig(BaseModel):
    """Configuration for analysis-result caching."""

    enabled: bool = True
    cache_dir: Path = Field(default=Path(".quality_cache"))  # On-disk cache location.
    max_age_days: int = Field(default=7, ge=1)  # Cache entry expiry.
    use_memory_cache: bool = True  # Additional in-process cache.
class IntegrationConfig(BaseModel):
    """Configuration for external integrations (git, jscpd)."""

    # Git integration
    use_git: bool = True
    analyze_git_history: bool = False  # Analyze historical changes.
    blame_duplicates: bool = False  # Attach git blame info to duplicates.
    # JSCPD integration for multi-language support
    use_jscpd: bool = True
    jscpd_path: str | None = None  # None means auto-detect the executable.
    jscpd_config: dict[str, str | int | float | bool] = Field(default_factory=dict)  # Extra jscpd options.
class ExceptionRule(BaseModel):
    """Configuration for a single exception (suppression) rule.

    A rule suppresses findings of one analysis type, optionally narrowed
    to an issue type and to file/line patterns, and may carry a reason
    and an expiration date.
    """

    analysis_type: str  # "complexity", "duplicates", "modernization", "code_smells"
    issue_type: str | None = None  # Specific issue type (optional)
    file_patterns: list[str] = Field(default_factory=list)  # File path patterns
    line_patterns: list[str] = Field(default_factory=list)  # Line content patterns
    reason: str | None = None  # Optional reason for the exception
    expires: str | None = None  # Optional expiration date (YYYY-MM-DD)
    enabled: bool = True
class ExceptionsConfig(BaseModel):
    """Configuration for analysis exceptions (suppressions)."""

    enabled: bool = True
    rules: list[ExceptionRule] = Field(default_factory=list)  # Fine-grained rules.
    # Global file/directory exceptions
    exclude_files: list[str] = Field(default_factory=list)
    exclude_directories: list[str] = Field(default_factory=list)
    # Temporary suppressions (auto-expire)
    temporary_suppressions: dict[str, str] = Field(
        default_factory=dict
    )  # rule_id -> expiry_date
class QualityConfig(BaseModel):
    """Main configuration for code quality analysis.

    Aggregates every analysis sub-configuration; all sections have
    defaults, so an empty config file still yields a usable setup.
    """

    # Core configuration sections
    detection: DetectionConfig = Field(default_factory=DetectionConfig)
    complexity: ComplexityConfig = Field(default_factory=ComplexityConfig)
    languages: LanguageConfig = Field(default_factory=LanguageConfig)
    paths: PathConfig = Field(default_factory=PathConfig)
    refactoring: RefactoringConfig = Field(default_factory=RefactoringConfig)
    reporting: ReportingConfig = Field(default_factory=ReportingConfig)
    cache: CacheConfig = Field(default_factory=CacheConfig)
    integrations: IntegrationConfig = Field(default_factory=IntegrationConfig)
    exceptions: ExceptionsConfig = Field(default_factory=ExceptionsConfig)
    # Global settings
    version: str = "1.0.0"
    debug: bool = False
    verbose: bool = False

    @field_validator("detection")
    def validate_similarity_weights(cls, v):
        """Ensure similarity algorithm weights sum to approximately 1.0.

        If the enabled algorithms' weights deviate from 1.0 by more than
        0.1, each enabled weight is divided by the running total so the
        final weights sum to 1.

        NOTE(review): pydantic v2's field_validator conventionally needs
        @classmethod, and the v1-style inner Config class below is
        deprecated under v2 — confirm against the pinned pydantic version.
        """
        total_weight = sum(alg.weight for alg in v.similarity_algorithms if alg.enabled)
        if abs(total_weight - 1.0) > 0.1:
            # Auto-normalize weights
            for alg in v.similarity_algorithms:
                if alg.enabled:
                    alg.weight = alg.weight / total_weight
        return v

    class Config:
        """Pydantic v1-style model configuration."""

        validate_assignment = True
        extra = "forbid"
def load_config(config_path: Path | None = None) -> QualityConfig:
    """Load configuration from *config_path* or from a well-known location.

    When no explicit path is given, the first existing candidate among
    quality.yaml, quality.yml, .quality.yaml, .quality.yml and
    pyproject.toml is used; when none exists, defaults are returned.
    """
    if config_path is None:
        candidates = (
            Path("quality.yaml"),
            Path("quality.yml"),
            Path(".quality.yaml"),
            Path(".quality.yml"),
            Path("pyproject.toml"),  # Read from its [tool.quality] section.
        )
        config_path = next((p for p in candidates if p.exists()), None)
    if config_path is not None and config_path.exists():
        return _load_from_file(config_path)
    return QualityConfig()
def _load_from_file(config_path: Path) -> QualityConfig:
"""Load configuration from specific file."""
if config_path.suffix.lower() in [".yaml", ".yml"]:
return _load_from_yaml(config_path)
elif config_path.name == "pyproject.toml":
return _load_from_pyproject(config_path)
else:
raise ValueError(f"Unsupported config file format: {config_path}")
def _load_from_yaml(config_path: Path) -> QualityConfig:
    """Parse a YAML config file into a QualityConfig.

    An empty document (safe_load yields None/empty) produces an
    all-defaults configuration.
    """
    with open(config_path, encoding="utf-8") as fh:
        raw = yaml.safe_load(fh)
    if not raw:
        return QualityConfig()
    return QualityConfig(**raw)
def _load_from_pyproject(config_path: Path) -> QualityConfig:
    """Load configuration from the [tool.quality] table of a pyproject.toml.

    Prefers the standard-library ``tomllib`` (Python 3.11+) and only falls
    back to the third-party ``tomli`` backport on older interpreters — the
    original order (tomli first) shadowed the stdlib parser unnecessarily.

    Raises:
        ImportError: if neither TOML parser is available.
    """
    try:
        import tomllib as toml_reader  # stdlib on Python 3.11+
    except ImportError:
        try:
            import tomli as toml_reader  # backport for older Pythons
        except ImportError as e:
            raise ImportError(
                "tomli package required to read pyproject.toml. "
                "Install with: pip install tomli"
            ) from e
    with open(config_path, "rb") as f:
        data = toml_reader.load(f)
    # Only the [tool.quality] table is relevant; all other tool config is
    # ignored.  An absent/empty table yields an all-defaults config.
    quality_config = data.get("tool", {}).get("quality", {})
    return QualityConfig(**quality_config) if quality_config else QualityConfig()
def save_config(config: QualityConfig, output_path: Path) -> None:
    """Serialize *config* to *output_path* as YAML, omitting default values.

    Only fields that differ from their defaults are written so the emitted
    file stays minimal.  Keys are sorted for stable, diff-friendly output.
    """
    # Pydantic v2 renamed .dict() to .model_dump(); support both so this
    # works regardless of the installed major version (the module already
    # uses the v2 @field_validator API elsewhere).
    dump = getattr(config, "model_dump", None) or config.dict
    with open(output_path, "w", encoding="utf-8") as f:
        yaml.dump(
            dump(exclude_defaults=True),
            f,
            default_flow_style=False,
            sort_keys=True,
        )

View File

@@ -0,0 +1,23 @@
"""Core components for code quality analysis."""
from .ast_analyzer import ASTAnalyzer
from .base import (
AnalysisResult,
CodeBlock,
ComplexityMetrics,
DuplicateMatch,
RefactoringSuggestion,
SimilarityAlgorithm,
)
from .cache import CacheManager
__all__ = [
"AnalysisResult",
"ASTAnalyzer",
"CacheManager",
"CodeBlock",
"ComplexityMetrics",
"DuplicateMatch",
"RefactoringSuggestion",
"SimilarityAlgorithm",
]

View File

@@ -0,0 +1,281 @@
"""Enhanced AST analysis for code quality detection."""
import ast
from .base import CodeBlock, ComplexityMetrics
class ASTAnalyzer(ast.NodeVisitor):
"""Enhanced AST visitor for extracting code structure and complexity metrics."""
def __init__(self, file_path: str = "", content: str = ""):
self.file_path = file_path
self.content_lines = content.splitlines() if content else []
self.functions: list[CodeBlock] = []
self.classes: list[CodeBlock] = []
self.code_blocks: list[CodeBlock] = []
self.imports: list[str] = []
self.global_variables: set[str] = set()
self.call_graph: dict[str, list[str]] = {}
def extract_code_blocks(self, file_path, min_lines: int = 5) -> list[CodeBlock]:
"""Extract code blocks from a file."""
try:
with open(file_path, encoding="utf-8") as f:
content = f.read()
except (OSError, UnicodeDecodeError):
return []
# Reset analyzer state
self.__init__(str(file_path), content)
try:
tree = ast.parse(content)
except SyntaxError:
return []
else:
self.visit(tree)
# Filter blocks by minimum size
filtered_blocks = []
for block in self.code_blocks:
if (block.end_line - block.start_line + 1) >= min_lines:
filtered_blocks.append(block)
return filtered_blocks
def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
"""Visit function definitions with complexity analysis."""
complexity = self._calculate_cyclomatic_complexity(node)
cognitive_complexity = self._calculate_cognitive_complexity(node)
metrics = ComplexityMetrics(
cyclomatic_complexity=complexity, cognitive_complexity=cognitive_complexity
)
block = self._extract_code_block(node, node.name, "function", metrics)
self.functions.append(block)
self._extract_function_calls(node, node.name)
self.generic_visit(node)
def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
"""Visit async function definitions."""
complexity = self._calculate_cyclomatic_complexity(node)
cognitive_complexity = self._calculate_cognitive_complexity(node)
metrics = ComplexityMetrics(
cyclomatic_complexity=complexity, cognitive_complexity=cognitive_complexity
)
block = self._extract_code_block(node, node.name, "function", metrics)
self.functions.append(block)
self._extract_function_calls(node, node.name)
self.generic_visit(node)
def visit_ClassDef(self, node: ast.ClassDef) -> None:
"""Visit class definitions."""
# Class complexity is sum of method complexities
methods = [
n
for n in ast.walk(node)
if isinstance(n, ast.FunctionDef | ast.AsyncFunctionDef)
]
total_complexity = sum(
self._calculate_cyclomatic_complexity(method) for method in methods
)
metrics = ComplexityMetrics(cyclomatic_complexity=total_complexity)
block = self._extract_code_block(node, node.name, "class", metrics)
self.classes.append(block)
self.generic_visit(node)
def visit_Import(self, node: ast.Import) -> None:
"""Track imports."""
for alias in node.names:
self.imports.append(alias.name)
self.generic_visit(node)
def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
"""Track from imports."""
if node.module:
for alias in node.names:
self.imports.append(f"{node.module}.{alias.name}")
self.generic_visit(node)
def visit_Assign(self, node: ast.Assign) -> None:
"""Track global variable assignments."""
for target in node.targets:
if isinstance(target, ast.Name):
self.global_variables.add(target.id)
self.generic_visit(node)
def _extract_code_block(
self,
node: ast.AST,
name: str,
block_type: str,
complexity_metrics: ComplexityMetrics | None = None,
) -> CodeBlock:
"""Extract code block from AST node with enhanced metadata."""
start_line = node.lineno
end_line = getattr(node, "end_lineno", start_line)
if end_line is None:
end_line = start_line
content = "\n".join(self.content_lines[start_line - 1 : end_line])
block = CodeBlock(
file_path=self.file_path,
start_line=start_line,
end_line=end_line,
content=content,
complexity_metrics=complexity_metrics,
function_name=name if block_type == "function" else None,
class_name=name if block_type == "class" else None,
)
self.code_blocks.append(block)
return block
def _calculate_cyclomatic_complexity(self, node: ast.AST) -> int:
"""Calculate McCabe cyclomatic complexity."""
complexity = 1 # Base complexity
for child in ast.walk(node):
if isinstance(
child,
ast.If
| ast.While
| ast.For
| ast.AsyncFor
| ast.ExceptHandler
| ast.With
| ast.AsyncWith
| ast.Assert,
):
complexity += 1
elif isinstance(child, ast.BoolOp):
complexity += len(child.values) - 1
elif isinstance(child, ast.Break | ast.Continue):
complexity += 1
return complexity
def _calculate_cognitive_complexity(self, node: ast.AST) -> int:
"""Calculate cognitive complexity (more human-oriented)."""
complexity = 0
nesting_level = 0
def visit_node(n: ast.AST, level: int) -> int:
nonlocal complexity
local_complexity = 0
if isinstance(
n, ast.If | ast.While | ast.For | ast.AsyncFor | ast.ExceptHandler
):
local_complexity += 1 + level
elif isinstance(n, ast.Break | ast.Continue):
local_complexity += 1
elif isinstance(n, ast.BoolOp):
local_complexity += len(n.values) - 1
# Increase nesting for certain constructs
if isinstance(
n,
ast.If
| ast.While
| ast.For
| ast.AsyncFor
| ast.With
| ast.AsyncWith
| ast.Try,
):
level += 1
for child in ast.iter_child_nodes(n):
local_complexity += visit_node(child, level)
return local_complexity
return visit_node(node, nesting_level)
def _extract_function_calls(self, node: ast.AST, function_name: str) -> None:
"""Extract function calls to build call graph."""
calls = []
for child in ast.walk(node):
if isinstance(child, ast.Call):
if isinstance(child.func, ast.Name):
calls.append(child.func.id)
elif isinstance(child.func, ast.Attribute):
calls.append(child.func.attr)
self.call_graph[function_name] = calls
def get_code_structure_signature(self, node: ast.AST) -> str:
"""Generate structure signature for semantic comparison."""
structure_elements = []
for child in ast.walk(node):
if isinstance(child, ast.FunctionDef):
structure_elements.append(f"func:{len(child.args.args)}")
elif isinstance(child, ast.ClassDef):
structure_elements.append(f"class:{len(child.bases)}")
elif isinstance(child, ast.If):
structure_elements.append("if")
elif isinstance(child, ast.For):
structure_elements.append("for")
elif isinstance(child, ast.While):
structure_elements.append("while")
elif isinstance(child, ast.Try):
structure_elements.append("try")
return "|".join(structure_elements)
def get_variable_usage_pattern(self, node: ast.AST) -> dict[str, int]:
"""Analyze variable usage patterns."""
variable_usage = {}
for child in ast.walk(node):
if isinstance(child, ast.Name):
name = child.id
variable_usage[name] = variable_usage.get(name, 0) + 1
return variable_usage
def detect_code_smells(self) -> list[str]:
"""Detect common code smells."""
smells = []
# Long methods
long_methods = [f for f in self.functions if f.lines_count > 30]
if long_methods:
smells.append(
f"Long methods detected: {len(long_methods)} methods > 30 lines"
)
# Complex methods
complex_methods = [
f
for f in self.functions
if f.complexity_metrics and f.complexity_metrics.cyclomatic_complexity > 10
]
if complex_methods:
smells.append(
f"Complex methods detected: {len(complex_methods)} methods with complexity > 10"
)
# Many parameters
for func in self.functions:
try:
tree = ast.parse(func.content)
for node in ast.walk(tree):
if isinstance(node, ast.FunctionDef) and len(node.args.args) > 5:
smells.append(
f"Method with many parameters: {func.function_name} ({len(node.args.args)} parameters)"
)
except Exception:
pass
return smells

268
src/quality/core/base.py Normal file
View File

@@ -0,0 +1,268 @@
"""Base classes and interfaces for code quality analysis."""
import hashlib
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Protocol
class SimilarityAlgorithm(Protocol):
    """Protocol for similarity calculation algorithms.

    Structural typing: any object exposing a matching ``calculate`` method
    satisfies this protocol; no inheritance is required.
    """

    def calculate(self, text1: str, text2: str) -> float:
        """Calculate similarity between two text strings.

        Args:
            text1: First text string
            text2: Second text string

        Returns:
            Similarity score between 0.0 and 1.0
        """
        ...
class RefactoringType(Enum):
    """Types of refactoring suggestions."""

    EXTRACT_METHOD = "extract_method"  # pull duplicated code into a shared method
    EXTRACT_CLASS = "extract_class"  # move related duplication into a new class
    INTRODUCE_PARAMETER_OBJECT = "introduce_parameter_object"  # bundle long parameter lists
    TEMPLATE_METHOD = "template_method"  # share a skeleton, override varying steps
    CONSOLIDATE_CONDITIONAL = "consolidate_conditional"  # merge duplicated branch logic
    REMOVE_DUPLICATE = "remove_duplicate"  # delete a redundant copy outright
@dataclass
class ComplexityMetrics:
    """Code complexity metrics.

    Only the cyclomatic number is mandatory; the remaining metrics are
    optional and simply contribute nothing when absent.
    """

    cyclomatic_complexity: int
    cognitive_complexity: int | None = None
    halstead_difficulty: float | None = None
    halstead_effort: float | None = None
    maintainability_index: float | None = None

    @property
    def complexity_score(self) -> float:
        """Weighted overall score: cyclomatic + 0.5*cognitive + 0.3*Halstead.

        Components that are None (or zero) add nothing to the score.
        """
        cognitive = self.cognitive_complexity or 0
        halstead = self.halstead_difficulty or 0.0
        return self.cyclomatic_complexity + cognitive * 0.5 + halstead * 0.3
@dataclass
class CodeBlock:
    """Represents a block of code with metadata.

    ``content_hash`` and ``normalized_content`` are derived from ``content``
    in ``__post_init__`` and must not be supplied by callers.
    """

    file_path: str
    start_line: int
    end_line: int
    content: str
    content_hash: str = field(init=False)
    normalized_content: str = field(init=False)
    complexity_metrics: ComplexityMetrics | None = None
    function_name: str | None = None
    class_name: str | None = None

    def __post_init__(self) -> None:
        """Initialize computed fields."""
        # MD5 is used purely as a cheap content fingerprint, not for security.
        self.content_hash = hashlib.md5(self.content.encode()).hexdigest()
        self.normalized_content = self._normalize_content()

    def _normalize_content(self) -> str:
        """Normalize content for comparison with enhanced identifier abstraction.

        Strips comments, collapses string/numeric literals to placeholders,
        and abstracts user-defined identifiers to ``VAR`` so structurally
        identical code compares equal regardless of naming.
        """
        import re

        content = self.content
        # Remove comments (Python, JavaScript, TypeScript)
        content = re.sub(r"#.*$", "", content, flags=re.MULTILINE)  # Python comments
        content = re.sub(r"//.*$", "", content, flags=re.MULTILINE)  # JS/TS single-line
        content = re.sub(r"/\*.*?\*/", "", content, flags=re.DOTALL)  # JS/TS multi-line
        # Remove string literals but preserve their structure
        content = re.sub(r'""".*?"""', '"""STRING"""', content, flags=re.DOTALL)
        content = re.sub(r"'''.*?'''", "'''STRING'''", content, flags=re.DOTALL)
        content = re.sub(r'"[^"]*"', '"STRING"', content)
        content = re.sub(r"'[^']*'", "'STRING'", content)
        # Normalize numeric literals
        content = re.sub(r'\b\d+\.?\d*\b', 'NUM', content)
        # Keywords and singletons to keep verbatim.  Membership is checked
        # case-sensitively: Python keywords are case-sensitive, so "IF" is an
        # ordinary identifier and must be abstracted, while "None" must be
        # preserved.  (The previous ``token.lower() in ...`` comparison could
        # never match the capitalized entries None/True/False, wrongly
        # collapsing them to VAR.)
        python_keywords = {
            'def', 'class', 'if', 'else', 'elif', 'for', 'while', 'try', 'except',
            'finally', 'with', 'as', 'import', 'from', 'return', 'yield', 'pass',
            'break', 'continue', 'and', 'or', 'not', 'in', 'is', 'lambda', 'None',
            'True', 'False', 'self', 'cls'
        }
        # Split into tokens and normalize identifiers
        tokens = re.findall(r'\b\w+\b|[^\w\s]', content)
        normalized_tokens = []
        for token in tokens:
            if token in python_keywords or not re.match(r'^[a-zA-Z_]\w*$', token):
                # Keep keywords and non-identifiers as-is
                normalized_tokens.append(token)
            else:
                # Abstract user-defined identifiers
                normalized_tokens.append('VAR')
        content = ' '.join(normalized_tokens)
        # Collapse whitespace runs so formatting differences don't matter.
        content = re.sub(r"\s+", " ", content)
        return content.strip()

    @property
    def lines_count(self) -> int:
        """Number of (inclusive) source lines covered by this block."""
        return self.end_line - self.start_line + 1

    @property
    def relative_path(self) -> str:
        """Path relative to the current working directory, when possible."""
        try:
            return str(Path(self.file_path).relative_to(Path.cwd()))
        except ValueError:
            # File lives outside the cwd; fall back to the stored path.
            return self.file_path
@dataclass
class RefactoringSuggestion:
    """Suggestion for refactoring duplicated code."""

    refactoring_type: RefactoringType
    description: str
    affected_blocks: list[CodeBlock]
    effort_estimate: float  # Hours
    risk_score: float  # 0-1, higher = riskier
    expected_benefit: str
    implementation_steps: list[str] = field(default_factory=list)

    @property
    def priority_score(self) -> float:
        """Benefit (lines saved + complexity removed) over effort plus risk."""
        total_lines = sum(b.lines_count for b in self.affected_blocks)
        lines_saved = total_lines - 1
        complexity_reduction = 0.0
        for b in self.affected_blocks:
            if b.complexity_metrics:
                complexity_reduction += b.complexity_metrics.complexity_score
        benefit = lines_saved * 0.1 + complexity_reduction * 0.5
        # Risk is weighted double relative to raw effort hours; the floor
        # keeps near-zero-cost suggestions from dividing by zero.
        cost = self.effort_estimate + self.risk_score * 2
        return benefit / max(cost, 0.1)
@dataclass
class DuplicateMatch:
    """Represents a duplicate code match."""

    blocks: list[CodeBlock]
    similarity_score: float
    match_type: str  # 'exact', 'similar', 'structural', 'semantic'
    description: str
    complexity_score: float = 0.0
    priority_score: float = 0.0
    refactoring_suggestion: RefactoringSuggestion | None = None

    def __post_init__(self) -> None:
        """Derive complexity and priority scores from the matched blocks."""
        if not self.blocks:
            return
        # Mean complexity over blocks that carry metrics; 0.0 when none do.
        scores = [
            b.complexity_metrics.complexity_score
            for b in self.blocks
            if b.complexity_metrics
        ]
        self.complexity_score = sum(scores) / len(scores) if scores else 0.0
        # Priority grows with similarity, complexity, and total block size.
        total_lines = sum(b.lines_count for b in self.blocks)
        self.priority_score = (
            self.similarity_score * self.complexity_score * (total_lines / 10)
        )
@dataclass
class AnalysisResult:
    """Result of code quality analysis."""

    duplicate_matches: list[DuplicateMatch]
    total_files_analyzed: int
    total_lines_analyzed: int
    total_duplicated_lines: int
    analysis_duration: float  # wall-clock seconds
    summary_stats: dict[str, int | float] = field(default_factory=dict)
    refactoring_suggestions: list[RefactoringSuggestion] = field(default_factory=list)

    @property
    def duplication_percentage(self) -> float:
        """Share of analyzed lines that are duplicated, as a percentage.

        Returns 0.0 when nothing was analyzed (avoids division by zero).
        """
        if not self.total_lines_analyzed:
            return 0.0
        return (self.total_duplicated_lines / self.total_lines_analyzed) * 100

    @property
    def high_priority_matches(self) -> list[DuplicateMatch]:
        """Matches whose priority score exceeds 5.0."""
        return [m for m in self.duplicate_matches if m.priority_score > 5.0]
class CodeAnalyzer(ABC):
    """Abstract base class for code analyzers.

    Concrete subclasses implement language-specific analysis and report
    which languages they support.
    """

    @abstractmethod
    def analyze(self, code: str, file_path: str) -> AnalysisResult:
        """Analyze code and return results."""
        ...

    @abstractmethod
    def supports_language(self, language: str) -> bool:
        """Check if analyzer supports the given language."""
        ...
class QualityMetricsCalculator(ABC):
    """Abstract base class for quality metrics calculation.

    Implementations compute complexity and maintainability figures for a
    raw source string.
    """

    @abstractmethod
    def calculate_complexity(self, code: str) -> ComplexityMetrics:
        """Calculate complexity metrics for code."""
        ...

    @abstractmethod
    def calculate_maintainability_index(self, code: str) -> float:
        """Calculate maintainability index."""
        ...
@dataclass
class AnalysisConfig:
    """Configuration for analysis."""

    # Size floors a block must reach before it is considered at all.
    min_lines: int = 5
    min_tokens: int = 50
    # Similarity score (0.0-1.0) above which two blocks count as duplicates.
    similarity_threshold: float = 0.8
    # Cyclomatic complexity above which a block is flagged as complex.
    complexity_threshold: int = 10
    # Languages to analyze plus glob patterns selecting/skipping files.
    languages: set[str] = field(default_factory=lambda: {"python"})
    exclude_patterns: list[str] = field(default_factory=list)
    include_patterns: list[str] = field(default_factory=lambda: ["**/*.py"])
    # Feature toggles.
    enable_semantic_analysis: bool = True
    enable_refactoring_suggestions: bool = True
    max_files: int | None = None  # None = no limit on analyzed files
    parallel_processing: bool = True
    cache_enabled: bool = True

131
src/quality/core/cache.py Normal file
View File

@@ -0,0 +1,131 @@
"""Caching system for performance optimization."""
import hashlib
import pickle
from pathlib import Path
from typing import Any, Generic, TypeVar
from .base import CodeBlock
T = TypeVar("T")
class CacheManager(Generic[T]):
    """Generic cache manager for storing analysis results.

    Two-tier cache: an in-memory dict in front of pickle files stored in
    ``cache_dir``.  All disk errors are deliberately swallowed so a broken
    cache can never break an analysis run.

    Security note: entries are loaded with ``pickle``; only point
    ``cache_dir`` at a directory the current user controls, since
    unpickling untrusted data can execute arbitrary code.
    """

    def __init__(self, cache_dir: Path = Path(".quality_cache")):
        self.cache_dir = cache_dir
        # parents=True: allow nested cache locations (e.g. build/tmp/cache)
        # whose intermediate directories do not exist yet; the original
        # mkdir(exist_ok=True) crashed in that case.
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.memory_cache: dict[str, T] = {}

    def _get_cache_key(self, data: str, prefix: str = "") -> str:
        """Generate a short, stable cache key from *data*."""
        hash_obj = hashlib.sha256(data.encode())
        # 16 hex chars (64 bits) keeps file names short; collisions are
        # practically irrelevant at this scale.
        return f"{prefix}_{hash_obj.hexdigest()[:16]}"

    def get(self, key: str, use_memory: bool = True) -> T | None:
        """Get item from cache; returns None on a miss."""
        # Check memory cache first
        if use_memory and key in self.memory_cache:
            return self.memory_cache[key]
        # Fall back to the disk cache
        cache_file = self.cache_dir / f"{key}.pickle"
        if cache_file.exists():
            try:
                with open(cache_file, "rb") as f:
                    data = pickle.load(f)
                if use_memory:
                    self.memory_cache[key] = data
                return data
            except Exception:
                # Corrupted/unreadable entry: drop it and treat as a miss.
                cache_file.unlink(missing_ok=True)
        return None

    def set(self, key: str, value: T, use_memory: bool = True) -> None:
        """Store item in both cache tiers (the disk write is best-effort)."""
        if use_memory:
            self.memory_cache[key] = value
        # Store on disk
        cache_file = self.cache_dir / f"{key}.pickle"
        try:
            with open(cache_file, "wb") as f:
                pickle.dump(value, f)
        except Exception:
            pass  # Fail silently if can't write to disk

    def get_file_hash(self, file_path: Path) -> str:
        """Hash of file contents plus mtime; empty string when unreadable."""
        try:
            stat = file_path.stat()
            # MD5 is a cheap fingerprint here, not a security boundary.
            content_hash = hashlib.md5(file_path.read_bytes()).hexdigest()
        except Exception:
            return ""
        else:
            return f"{content_hash}_{stat.st_mtime}"

    def is_file_cached(self, file_path: Path) -> bool:
        """Check if file analysis is cached and up-to-date."""
        file_hash = self.get_file_hash(file_path)
        if not file_hash:
            return False
        cache_key = self._get_cache_key(str(file_path), "file")
        cached_result = self.get(f"{cache_key}_meta")
        return cached_result == file_hash

    def cache_file_analysis(self, file_path: Path, blocks: list[CodeBlock]) -> None:
        """Cache file analysis results together with a freshness marker."""
        file_hash = self.get_file_hash(file_path)
        cache_key = self._get_cache_key(str(file_path), "file")
        # Cache the blocks, then the metadata consulted by is_file_cached().
        self.set(cache_key, blocks)
        self.set(f"{cache_key}_meta", file_hash)

    def get_cached_file_analysis(self, file_path: Path) -> list[CodeBlock] | None:
        """Get cached file analysis if up-to-date, else None."""
        if not self.is_file_cached(file_path):
            return None
        cache_key = self._get_cache_key(str(file_path), "file")
        return self.get(cache_key)

    def clear(self) -> None:
        """Clear both the memory and the disk cache."""
        self.memory_cache.clear()
        for cache_file in self.cache_dir.glob("*.pickle"):
            cache_file.unlink(missing_ok=True)

    def clear_old_entries(self, max_age_days: int = 7) -> None:
        """Delete disk entries older than *max_age_days* (best-effort)."""
        import time

        max_age_seconds = max_age_days * 24 * 3600
        current_time = time.time()
        for cache_file in self.cache_dir.glob("*.pickle"):
            try:
                if (current_time - cache_file.stat().st_mtime) > max_age_seconds:
                    cache_file.unlink()
            except Exception:
                # Entry may have been removed concurrently; ignore.
                pass

    def get_cache_stats(self) -> dict[str, Any]:
        """Get cache statistics (item counts, total disk size in MB)."""
        disk_files = list(self.cache_dir.glob("*.pickle"))
        total_size = sum(f.stat().st_size for f in disk_files if f.exists())
        return {
            "memory_items": len(self.memory_cache),
            "disk_files": len(disk_files),
            "total_size_mb": total_size / (1024 * 1024),
            "cache_dir": str(self.cache_dir),
        }

View File

@@ -0,0 +1,354 @@
"""Exception handling system for quality analysis."""
import fnmatch
import re
from datetime import datetime
from pathlib import Path
from typing import Any
from ..config.schemas import (
ExceptionRule,
QualityConfig,
)
class ExceptionFilter:
    """Filters analysis results based on configured exception rules.

    Rules and global exclusions come from ``QualityConfig.exceptions``;
    disabled and expired rules are dropped once at construction time.
    """

    def __init__(self, config: QualityConfig):
        self.config = config
        self.exceptions_config = config.exceptions
        # Resolved once here; this instance does not pick up later config edits.
        self.active_rules = self._get_active_rules()

    def _get_active_rules(self) -> list[ExceptionRule]:
        """Get currently active exception rules.

        Skips disabled rules, expired rules, and rules whose ``expires``
        field is not a valid YYYY-MM-DD date.
        """
        if not self.exceptions_config.enabled:
            return []
        active_rules = []
        current_date = datetime.now().date()
        for rule in self.exceptions_config.rules:
            if not rule.enabled:
                continue
            # Check if rule has expired
            if rule.expires:
                try:
                    expire_date = datetime.strptime(rule.expires, "%Y-%m-%d").date()
                    if current_date > expire_date:
                        continue
                except ValueError:
                    # Invalid date format, skip rule
                    continue
            active_rules.append(rule)
        return active_rules

    def should_suppress_issue(
        self,
        analysis_type: str,
        issue_type: str | None,
        file_path: str,
        line_number: int,
        line_content: str = "",
    ) -> tuple[bool, str | None]:
        """
        Check if an issue should be suppressed.

        Global file/directory exclusions take precedence over rules; the
        first matching rule wins.

        Returns:
            (should_suppress, reason)
        """
        # Check global file/directory exclusions first
        if self._is_globally_excluded(file_path):
            return True, "File/directory globally excluded"
        # Check exception rules
        for rule in self.active_rules:
            if self._rule_matches(
                rule, analysis_type, issue_type, file_path, line_number, line_content
            ):
                return (
                    True,
                    rule.reason or f"Matched exception rule: {rule.analysis_type}",
                )
        return False, None

    def _is_globally_excluded(self, file_path: str) -> bool:
        """Check if file is globally excluded via config glob patterns."""
        normalized_path = str(Path(file_path).resolve())
        # Check excluded files (match both the absolute and as-given path)
        for pattern in self.exceptions_config.exclude_files:
            if fnmatch.fnmatch(normalized_path, pattern) or fnmatch.fnmatch(
                file_path, pattern
            ):
                return True
        # Check excluded directories
        for pattern in self.exceptions_config.exclude_directories:
            if fnmatch.fnmatch(str(Path(file_path).parent), pattern):
                return True
            # Also check if any parent directory matches
            path_parts = Path(file_path).parts
            for i in range(len(path_parts)):
                partial_path = "/".join(path_parts[: i + 1])
                if fnmatch.fnmatch(partial_path, pattern):
                    return True
        return False

    def _rule_matches(
        self,
        rule: ExceptionRule,
        analysis_type: str,
        issue_type: str | None,
        file_path: str,
        line_number: int,
        line_content: str,
    ) -> bool:
        """Check if a rule matches the current issue.

        A rule matches only when every constraint it specifies (analysis
        type, issue type, file patterns, line patterns) matches; constraints
        left unset match everything.
        """
        # Check analysis type ("*" acts as a wildcard)
        if rule.analysis_type != analysis_type and rule.analysis_type != "*":
            return False
        # Check issue type if specified
        if rule.issue_type and rule.issue_type != issue_type:
            return False
        # Check file patterns (against the full path or the bare file name)
        if rule.file_patterns:
            file_matches = False
            for pattern in rule.file_patterns:
                if fnmatch.fnmatch(file_path, pattern) or fnmatch.fnmatch(
                    str(Path(file_path).name), pattern
                ):
                    file_matches = True
                    break
            if not file_matches:
                return False
        # Check line patterns (regex search against the offending line text)
        if rule.line_patterns and line_content:
            line_matches = False
            for pattern in rule.line_patterns:
                if re.search(pattern, line_content):
                    line_matches = True
                    break
            if not line_matches:
                return False
        return True

    # NOTE(review): the extractor parameters below are annotated with the
    # builtin ``callable`` and default to None; ``Callable | None`` would be
    # the accurate annotation (requires a typing import) — confirm intent.
    def filter_issues(
        self,
        analysis_type: str,
        issues: list[Any],
        get_file_path_fn: callable = None,
        get_line_number_fn: callable = None,
        get_line_content_fn: callable = None,
        get_issue_type_fn: callable = None,
    ) -> list[Any]:
        """
        Filter a list of issues based on exception rules.

        Args:
            analysis_type: Type of analysis ("complexity", "duplicates", etc.)
            issues: List of issues to filter
            get_file_path_fn: Function to extract file path from issue
            get_line_number_fn: Function to extract line number from issue
            get_line_content_fn: Function to extract line content from issue
            get_issue_type_fn: Function to extract issue type from issue

        Returns:
            The issues that were not suppressed, in their original order.
        """
        if not self.exceptions_config.enabled or not issues:
            return issues
        filtered_issues = []
        for issue in issues:
            # Extract issue details, falling back to attributes on the issue
            # object when no extractor function was supplied.
            file_path = (
                get_file_path_fn(issue)
                if get_file_path_fn
                else getattr(issue, "file_path", "")
            )
            line_number = (
                get_line_number_fn(issue)
                if get_line_number_fn
                else getattr(issue, "line_number", 0)
            )
            line_content = (
                get_line_content_fn(issue)
                if get_line_content_fn
                else getattr(issue, "line_content", "")
            )
            issue_type = (
                get_issue_type_fn(issue)
                if get_issue_type_fn
                else getattr(issue, "issue_type", None)
            )
            should_suppress, reason = self.should_suppress_issue(
                analysis_type, issue_type, file_path, line_number, line_content
            )
            if not should_suppress:
                filtered_issues.append(issue)
            elif self.config.debug:
                # Debug-only trace of what was filtered and why.
                print(
                    f"Suppressed {analysis_type} issue in {file_path}:{line_number} - {reason}"
                )
        return filtered_issues

    def get_suppression_summary(self) -> dict[str, Any]:
        """Get summary of active suppressions for reporting."""
        return {
            "enabled": self.exceptions_config.enabled,
            "active_rules": len(self.active_rules),
            "global_exclusions": {
                "files": len(self.exceptions_config.exclude_files),
                "directories": len(self.exceptions_config.exclude_directories),
            },
            "rules_by_type": self._summarize_rules_by_type(),
        }

    def _summarize_rules_by_type(self) -> dict[str, int]:
        """Summarize active rule counts keyed by analysis type."""
        summary = {}
        for rule in self.active_rules:
            analysis_type = rule.analysis_type
            summary[analysis_type] = summary.get(analysis_type, 0) + 1
        return summary
def create_example_exceptions_config() -> dict[str, Any]:
    """Create an example exceptions configuration.

    Mirrors the YAML template: global exclusions plus five illustrative
    rules covering complexity, modernization, duplicates and code smells.
    """
    file_excludes = [
        "*/tests/*",
        "*/test_*",
        "*/__pycache__/*",
        "*/migrations/*",
    ]
    directory_excludes = [
        "*/venv/*",
        "*/.venv/*",
        "*/node_modules/*",
        "*/.git/*",
    ]
    example_rules = [
        {
            "analysis_type": "complexity",
            "issue_type": "high_complexity",
            "file_patterns": ["*/legacy/*", "*/third_party/*"],
            "reason": "Legacy code with known complexity - migration planned",
        },
        {
            "analysis_type": "modernization",
            "issue_type": "pydantic_v1_pattern",
            "file_patterns": ["*/compatibility/*"],
            "line_patterns": ["# pydantic v1 required", "# TODO: migrate"],
            "reason": "Intentional Pydantic v1 usage for compatibility",
        },
        {
            "analysis_type": "modernization",
            "issue_type": "legacy_typing_import",
            "file_patterns": ["*/external/*"],
            "reason": "External library compatibility requirements",
        },
        {
            "analysis_type": "duplicates",
            "file_patterns": ["*/templates/*", "*/generated/*"],
            "reason": "Generated or template code - duplication expected",
        },
        {
            "analysis_type": "code_smells",
            "issue_type": "long_method",
            "file_patterns": ["*/parsers/*"],
            "reason": "Parser methods intentionally long for readability",
            "expires": "2024-12-31",
        },
    ]
    return {
        "exceptions": {
            "enabled": True,
            "exclude_files": file_excludes,
            "exclude_directories": directory_excludes,
            "rules": example_rules,
        }
    }
def create_exceptions_config_template() -> str:
    """Create a YAML template for exceptions configuration.

    Returns:
        A ready-to-save, commented YAML document that users can copy into a
        project and edit.
    """
    # The literal below is returned verbatim and parsed by users as YAML;
    # keep it valid YAML when editing.
    return """# Quality Analysis Exceptions Configuration
# This file allows you to suppress specific analysis results

exceptions:
  enabled: true

  # Global file and directory exclusions
  exclude_files:
    - "*/tests/*"
    - "*/test_*"
    - "*/__pycache__/*"
    - "*/migrations/*"

  exclude_directories:
    - "*/venv/*"
    - "*/.venv/*"
    - "*/node_modules/*"
    - "*/.git/*"

  # Specific exception rules
  rules:
    # Example: Suppress complexity issues in legacy code
    - analysis_type: "complexity"
      issue_type: "high_complexity"
      file_patterns:
        - "*/legacy/*"
        - "*/third_party/*"
      reason: "Legacy code with known complexity - migration planned"

    # Example: Allow intentional Pydantic v1 usage
    - analysis_type: "modernization"
      issue_type: "pydantic_v1_pattern"
      file_patterns:
        - "*/compatibility/*"
      line_patterns:
        - "# pydantic v1 required"
        - "# TODO: migrate"
      reason: "Intentional Pydantic v1 usage for compatibility"

    # Example: Suppress typing imports for external compatibility
    - analysis_type: "modernization"
      issue_type: "legacy_typing_import"
      file_patterns:
        - "*/external/*"
      reason: "External library compatibility requirements"

    # Example: Allow duplicates in generated/template code
    - analysis_type: "duplicates"
      file_patterns:
        - "*/templates/*"
        - "*/generated/*"
      reason: "Generated or template code - duplication expected"

    # Example: Temporary suppression with expiration
    - analysis_type: "code_smells"
      issue_type: "long_method"
      file_patterns:
        - "*/parsers/*"
      reason: "Parser methods intentionally long for readability"
      expires: "2024-12-31"
      enabled: true

# Analysis Types:
# - "complexity" - Code complexity issues
# - "duplicates" - Duplicate code detection
# - "modernization" - Modern Python pattern suggestions
# - "code_smells" - General code smell detection

# Issue Types vary by analysis - see CLI output for specific types
# Use "*" for analysis_type to match all analysis types
# Leave issue_type empty to match all issues of that analysis type
"""

View File

@@ -0,0 +1,9 @@
"""Code duplicate detection engine."""
from .engine import DuplicateDetectionEngine
from .matcher import DuplicateMatcher
__all__ = [
"DuplicateDetectionEngine",
"DuplicateMatcher",
]

View File

@@ -0,0 +1,420 @@
"""Enhanced duplicate detection engine with multiple algorithms."""
import ast
from pathlib import Path
from typing import Any
from ..complexity.analyzer import ComplexityAnalyzer
from ..config.schemas import QualityConfig
from ..core.ast_analyzer import ASTAnalyzer
from ..core.base import CodeBlock, DuplicateMatch
from .matcher import DuplicateMatcher
from ..similarity.base import SimilarityCalculator
from ..similarity.lsh import LSHDuplicateDetector
class DuplicateDetectionEngine:
"""Advanced duplicate detection engine with configurable algorithms."""
    def __init__(self, config: QualityConfig | None = None):
        """Build the engine; *config* defaults to a stock QualityConfig."""
        self.config = config or QualityConfig()
        self.detection_config = self.config.detection
        # Initialize exception filter
        # NOTE(review): imported locally, presumably to break a circular
        # import between the engine and core.exceptions — confirm.
        from ..core.exceptions import ExceptionFilter

        self.exception_filter = ExceptionFilter(self.config)
        # Initialize components
        self.ast_analyzer = ASTAnalyzer()
        self.complexity_analyzer = ComplexityAnalyzer(
            self.config.complexity, self.config
        )
        self.similarity_calculator = self._create_similarity_calculator()
        self.matcher = DuplicateMatcher(
            self.similarity_calculator, self.detection_config
        )
        # LSH (locality-sensitive hashing) detector for large-scale
        # detection; only constructed when enabled in the detection config.
        self.lsh_detector = None
        if self.detection_config.use_lsh:
            self.lsh_detector = LSHDuplicateDetector(
                threshold=self.detection_config.similarity_threshold,
                num_perm=128,  # assumed MinHash permutation count — TODO confirm
                bands=16,
                rows=8,
            )
    def detect_duplicates_in_files(
        self, file_paths: list[Path], max_workers: int | None = None
    ) -> list[DuplicateMatch]:
        """Detect duplicates across multiple files.

        Files that fail extraction are skipped silently.  ``max_workers`` is
        accepted but not used in this implementation.
        """
        # Extract code blocks from all files
        all_blocks = []
        for file_path in file_paths:
            try:
                blocks = self.ast_analyzer.extract_code_blocks(file_path)
                # Keep only blocks meeting both size floors from the config:
                # line count and whitespace-token count.
                filtered_blocks = [
                    block
                    for block in blocks
                    if (block.end_line - block.start_line + 1)
                    >= self.detection_config.min_lines
                    and len(block.content.split()) >= self.detection_config.min_tokens
                ]
                all_blocks.extend(filtered_blocks)
            except Exception:
                # Skip files that can't be parsed
                continue
        return self.detect_duplicates_in_blocks(all_blocks)
    def detect_duplicates_in_blocks(
        self, blocks: list[CodeBlock]
    ) -> list[DuplicateMatch]:
        """Detect duplicates in a list of code blocks.

        Uses the LSH index for large inputs (when enabled and past the
        configured threshold) and direct pairwise similarity otherwise.
        """
        if not blocks:
            return []
        # Use LSH once the dataset is large enough to amortize index build.
        if (
            self.detection_config.use_lsh
            and len(blocks) >= self.detection_config.lsh_threshold
            and self.lsh_detector
        ):
            return self._detect_with_lsh(blocks)
        else:
            return self._detect_with_similarity(blocks)
def find_duplicates_of_block(
self, target_block: CodeBlock, candidate_blocks: list[CodeBlock]
) -> list[DuplicateMatch]:
"""Find duplicates of a specific code block."""
matches = []
for candidate in candidate_blocks:
if candidate == target_block: # Skip self
continue
similarity = self.similarity_calculator.calculate_similarity(
target_block, candidate
)
if similarity >= self.detection_config.similarity_threshold:
# Calculate complexity metrics
target_complexity = self.complexity_analyzer.analyze_code(
target_block.content
)
candidate_complexity = self.complexity_analyzer.analyze_code(
candidate.content
)
match_type = "exact" if similarity >= 0.95 else "similar"
match = DuplicateMatch(
blocks=[target_block, candidate],
similarity_score=similarity,
match_type=match_type,
description=f"{match_type.title()} duplicate detected (similarity: {similarity:.3f})",
complexity_score=max(
target_complexity.get_overall_score(),
candidate_complexity.get_overall_score(),
),
priority_score=self._calculate_priority_score(
similarity,
target_complexity.get_overall_score(),
len([target_block, candidate]),
),
)
matches.append(match)
return matches
def get_detailed_analysis(self, duplicate_match: DuplicateMatch) -> dict[str, Any]:
"""Get detailed analysis of a duplicate match."""
if not duplicate_match.blocks:
return {}
# Analyze each block
block_analyses = []
for block in duplicate_match.blocks:
complexity = self.complexity_analyzer.analyze_code(block.content)
summary = self.complexity_analyzer.get_complexity_summary(complexity)
block_analyses.append(
{
"file_path": str(block.file_path),
"line_range": f"{block.start_line}-{block.end_line}",
"lines_of_code": block.end_line - block.start_line + 1,
"complexity": summary,
"content_preview": self._get_content_preview(block.content),
}
)
# Calculate similarity breakdown
similarity_breakdown = {}
if len(duplicate_match.blocks) >= 2:
similarity_breakdown = (
self.similarity_calculator.calculate_detailed_similarity(
duplicate_match.blocks[0], duplicate_match.blocks[1]
)
)
# Generate refactoring suggestions
suggestions = self._generate_refactoring_suggestions(duplicate_match)
return {
"match_info": {
"similarity_score": duplicate_match.similarity_score,
"match_type": duplicate_match.match_type,
"priority_score": duplicate_match.priority_score,
"complexity_score": duplicate_match.complexity_score,
},
"blocks": block_analyses,
"similarity_breakdown": similarity_breakdown,
"refactoring_suggestions": suggestions,
"estimated_effort": self._estimate_refactoring_effort(duplicate_match),
"risk_assessment": self._assess_refactoring_risk(duplicate_match),
}
def _create_similarity_calculator(self) -> SimilarityCalculator:
"""Create similarity calculator with configured algorithms."""
from ..similarity import (
CosineSimilarity,
JaccardSimilarity,
LevenshteinSimilarity,
SemanticSimilarity,
StructuralSimilarity,
)
algorithms = []
for algo_config in self.detection_config.similarity_algorithms:
if not algo_config.enabled:
continue
if algo_config.name == "levenshtein":
algorithms.append(LevenshteinSimilarity(algo_config))
elif algo_config.name == "jaccard":
algorithms.append(JaccardSimilarity(algo_config))
elif algo_config.name == "cosine":
algorithms.append(CosineSimilarity(algo_config))
elif algo_config.name == "semantic":
algorithms.append(SemanticSimilarity(algo_config))
elif algo_config.name == "structural":
algorithms.append(StructuralSimilarity(algo_config))
return SimilarityCalculator(algorithms)
def _detect_with_lsh(self, blocks: list[CodeBlock]) -> list[DuplicateMatch]:
"""Detect duplicates using LSH for performance."""
if not self.lsh_detector:
return []
# Add all blocks to LSH index
for block in blocks:
self.lsh_detector.add_code_block(block)
# Find duplicate groups
duplicate_groups = self.lsh_detector.find_all_duplicates()
# Convert to DuplicateMatch objects
matches = []
for group in duplicate_groups:
if len(group) < 2:
continue
# Calculate exact similarity for the group
representative = group[0]
similarities = []
for other in group[1:]:
similarity = self.similarity_calculator.calculate_similarity(
representative, other
)
similarities.append(similarity)
avg_similarity = (
sum(similarities) / len(similarities) if similarities else 0.0
)
# Calculate complexity metrics
complexities = []
for block in group:
complexity = self.complexity_analyzer.analyze_code(block.content)
complexities.append(complexity.get_overall_score())
max_complexity = max(complexities) if complexities else 0.0
match = DuplicateMatch(
blocks=group,
similarity_score=avg_similarity,
match_type="lsh_cluster",
description=f"LSH cluster with {len(group)} blocks (similarity: {avg_similarity:.3f})",
complexity_score=max_complexity,
priority_score=self._calculate_priority_score(
avg_similarity, max_complexity, len(group)
),
)
matches.append(match)
return self._filter_duplicate_matches(matches)
def _detect_with_similarity(self, blocks: list[CodeBlock]) -> list[DuplicateMatch]:
"""Detect duplicates using similarity algorithms."""
matches = self.matcher.find_all_duplicates(blocks)
return self._filter_duplicate_matches(matches)
def _filter_duplicate_matches(
self, matches: list[DuplicateMatch]
) -> list[DuplicateMatch]:
"""Filter duplicate matches based on exception rules."""
if not self.exception_filter:
return matches
filtered_matches = []
for match in matches:
# Check if any block in the match should be suppressed
should_suppress_match = False
for block in match.blocks:
should_suppress, reason = self.exception_filter.should_suppress_issue(
"duplicates",
"duplicate_code",
block.file_path,
block.start_line,
block.content,
)
if should_suppress:
should_suppress_match = True
break
if not should_suppress_match:
filtered_matches.append(match)
return filtered_matches
def _calculate_priority_score(
self, similarity: float, complexity: float, block_count: int
) -> float:
"""Calculate priority score for refactoring."""
# Base score from similarity
priority = similarity
# Boost for high complexity
if complexity > 50:
priority += 0.2
# Boost for more duplicates
if block_count > 2:
priority += 0.1 * (block_count - 2)
return min(priority, 1.0)
def _generate_refactoring_suggestions(
self, duplicate_match: DuplicateMatch
) -> list[str]:
"""Generate refactoring suggestions for duplicate code."""
suggestions = []
if len(duplicate_match.blocks) < 2:
return suggestions
first_block = duplicate_match.blocks[0]
# Analyze code structure
try:
tree = ast.parse(first_block.content)
# Check if it's a function
has_function = any(
isinstance(node, ast.FunctionDef) for node in ast.walk(tree)
)
has_class = any(isinstance(node, ast.ClassDef) for node in ast.walk(tree))
if has_function:
suggestions.append(
"Extract common function into a shared utility module"
)
suggestions.append(
"Consider creating a base function with configurable parameters"
)
elif has_class:
suggestions.append("Extract common class into a base class or mixin")
suggestions.append("Consider using inheritance or composition patterns")
else:
suggestions.append("Extract duplicate code into a reusable function")
suggestions.append(
"Consider creating a utility module for shared logic"
)
# Complexity-based suggestions
if duplicate_match.complexity_score > 60:
suggestions.append(
"High complexity detected - consider breaking down into smaller functions"
)
# Similarity-based suggestions
if duplicate_match.similarity_score > 0.95:
suggestions.append(
"Nearly identical code - prioritize for immediate refactoring"
)
elif duplicate_match.similarity_score > 0.8:
suggestions.append("Similar code - consider parameterizing differences")
except SyntaxError:
suggestions.append("Extract duplicate code into a reusable component")
return suggestions
def _estimate_refactoring_effort(self, duplicate_match: DuplicateMatch) -> str:
"""Estimate effort required for refactoring."""
if not duplicate_match.blocks:
return "Unknown"
total_lines = sum(
block.end_line - block.start_line + 1 for block in duplicate_match.blocks
)
if total_lines < 20:
return "Low (1-2 hours)"
elif total_lines < 100:
return "Medium (0.5-1 day)"
elif total_lines < 500:
return "High (1-3 days)"
else:
return "Very High (1+ weeks)"
def _assess_refactoring_risk(self, duplicate_match: DuplicateMatch) -> str:
"""Assess risk level of refactoring."""
risk_factors = []
if duplicate_match.complexity_score > 70:
risk_factors.append("High complexity")
if len(duplicate_match.blocks) > 5:
risk_factors.append("Many duplicates")
if duplicate_match.similarity_score < 0.85:
risk_factors.append("Moderate differences between duplicates")
# Check if duplicates span multiple files
unique_files = len(set(block.file_path for block in duplicate_match.blocks))
if unique_files > 3:
risk_factors.append("Cross-module dependencies")
if not risk_factors:
return "Low"
elif len(risk_factors) <= 2:
return "Medium"
else:
return "High"
def _get_content_preview(self, content: str, max_lines: int = 5) -> str:
"""Get a preview of code content."""
lines = content.split("\n")
if len(lines) <= max_lines:
return content
preview_lines = lines[:max_lines]
return "\n".join(preview_lines) + f"\n... ({len(lines) - max_lines} more lines)"

View File

@@ -0,0 +1,296 @@
"""Duplicate matching algorithms and strategies."""
from collections import defaultdict
from typing import Any
from ..config.schemas import DetectionConfig
from ..core.base import CodeBlock, DuplicateMatch
from ..similarity.base import SimilarityCalculator
class DuplicateMatcher:
    """Handles matching logic for finding duplicates.

    Wraps a ``SimilarityCalculator`` plus the detection thresholds and offers
    pairwise matching, clustering (union-find), confidence scoring, and
    merging of matches that share blocks.
    """

    def __init__(
        self, similarity_calculator: SimilarityCalculator, config: DetectionConfig
    ):
        """Store the pairwise similarity scorer and detection settings."""
        self.similarity_calculator = similarity_calculator
        self.config = config

    def find_all_duplicates(self, blocks: list[CodeBlock]) -> list[DuplicateMatch]:
        """Find all duplicate matches in a list of code blocks.

        Scores every unordered pair once (O(n^2) similarity calls), keeps
        pairs at or above the configured threshold, then merges matches that
        share blocks into clusters.
        """
        if len(blocks) < 2:
            return []
        matches = []
        # Each unordered pair (i, j) with i < j is generated exactly once by
        # this loop shape, so no "already processed" bookkeeping is needed
        # (a previous revision kept a redundant processed-pairs set).
        for i, block1 in enumerate(blocks):
            for block2 in blocks[i + 1 :]:
                similarity = self.similarity_calculator.calculate_similarity(
                    block1, block2
                )
                if similarity >= self.config.similarity_threshold:
                    match_type = "exact" if similarity >= 0.95 else "similar"
                    match = DuplicateMatch(
                        blocks=[block1, block2],
                        similarity_score=similarity,
                        match_type=match_type,
                        description=f"{match_type.title()} match between 2 blocks (similarity: {similarity:.3f})",
                        complexity_score=0.0,  # Will be calculated by engine
                        priority_score=similarity,
                    )
                    matches.append(match)
        return self._merge_overlapping_matches(matches)

    def find_duplicates_of_block(
        self, target_block: CodeBlock, candidate_blocks: list[CodeBlock]
    ) -> list[DuplicateMatch]:
        """Find duplicates of a specific block.

        Returns one match per candidate whose similarity to *target_block*
        meets the configured threshold. The target itself is skipped.
        """
        matches = []
        for candidate in candidate_blocks:
            if candidate == target_block:
                continue
            similarity = self.similarity_calculator.calculate_similarity(
                target_block, candidate
            )
            if similarity >= self.config.similarity_threshold:
                match_type = "exact" if similarity >= 0.95 else "similar"
                match = DuplicateMatch(
                    blocks=[target_block, candidate],
                    similarity_score=similarity,
                    match_type=match_type,
                    description=f"{match_type.title()} match with target block (similarity: {similarity:.3f})",
                    complexity_score=0.0,
                    priority_score=similarity,
                )
                matches.append(match)
        return matches

    def find_similar_blocks(
        self,
        target_block: CodeBlock,
        candidate_blocks: list[CodeBlock],
        threshold: float,
    ) -> list[tuple[CodeBlock, float]]:
        """Find blocks similar to target with a caller-supplied threshold.

        Returns ``(block, similarity)`` pairs sorted by similarity,
        highest first.
        """
        similar_blocks = []
        for candidate in candidate_blocks:
            if candidate == target_block:
                continue
            similarity = self.similarity_calculator.calculate_similarity(
                target_block, candidate
            )
            if similarity >= threshold:
                similar_blocks.append((candidate, similarity))
        # Sort by similarity descending
        similar_blocks.sort(key=lambda pair: pair[1], reverse=True)
        return similar_blocks

    def group_similar_blocks(self, blocks: list[CodeBlock]) -> list[list[CodeBlock]]:
        """Group blocks into clusters of similar code.

        Builds the full pairwise similarity matrix (O(n^2)) and unions blocks
        whose similarity meets the threshold, so each cluster is a transitive
        closure of the "similar" relation.
        """
        if len(blocks) < 2:
            return [[block] for block in blocks]
        # Build similarity matrix
        similarity_matrix = {}
        for i, block1 in enumerate(blocks):
            for j, block2 in enumerate(blocks[i + 1 :], i + 1):
                similarity_matrix[(i, j)] = (
                    self.similarity_calculator.calculate_similarity(block1, block2)
                )
        # Union-Find with path compression to group similar blocks.
        parent = list(range(len(blocks)))

        def find(x: int) -> int:
            # Path-compressing root lookup.
            if parent[x] != x:
                parent[x] = find(parent[x])
            return parent[x]

        def union(x: int, y: int) -> None:
            # Merge the two sets containing x and y.
            px, py = find(x), find(y)
            if px != py:
                parent[px] = py

        # Union blocks that are similar enough
        for (i, j), similarity in similarity_matrix.items():
            if similarity >= self.config.similarity_threshold:
                union(i, j)
        # Group blocks by their root parent
        groups = defaultdict(list)
        for i, block in enumerate(blocks):
            groups[find(i)].append(block)
        return list(groups.values())

    def calculate_match_confidence(self, match: DuplicateMatch) -> dict[str, Any]:
        """Calculate confidence metrics for a duplicate match.

        Combines four weighted, normalized factors — similarity (0.4), code
        length (0.2), token count (0.2) and complexity (0.2) — into a single
        [0, 1] confidence with a per-factor breakdown.
        """
        if len(match.blocks) < 2:
            return {"confidence": 0.0, "factors": []}
        block_count = len(match.blocks)
        # Longer matches with more tokens are more reliable signals.
        avg_length = sum(len(block.content) for block in match.blocks) / block_count
        avg_tokens = (
            sum(len(block.content.split()) for block in match.blocks) / block_count
        )
        # (factor name, raw value, normalized confidence in [0, 1], weight)
        raw_factors = [
            ("similarity_score", match.similarity_score, match.similarity_score, 0.4),
            ("code_length", avg_length, min(avg_length / 1000, 1.0), 0.2),
            ("token_count", avg_tokens, min(avg_tokens / 100, 1.0), 0.2),
            (
                "complexity_score",
                match.complexity_score,
                min(match.complexity_score / 100, 1.0),
                0.2,
            ),
        ]
        confidence_factors = [
            {
                "factor": name,
                "value": value,
                "weight": weight,
                "contribution": normalized * weight,
            }
            for name, value, normalized, weight in raw_factors
        ]
        total_confidence = sum(f["contribution"] for f in confidence_factors)
        return {
            "confidence": round(total_confidence, 3),
            "level": self._get_confidence_level(total_confidence),
            "factors": confidence_factors,
        }

    def _merge_overlapping_matches(
        self, matches: list[DuplicateMatch]
    ) -> list[DuplicateMatch]:
        """Merge matches that share code blocks.

        Matches transitively connected through shared block objects (by
        identity) collapse into one ``merged_cluster`` match with
        de-duplicated blocks and an averaged similarity score.
        """
        if len(matches) <= 1:
            return matches
        # Map each block (by object identity) to the matches containing it.
        block_to_matches = defaultdict(list)
        for i, match in enumerate(matches):
            for block in match.blocks:
                block_to_matches[id(block)].append(i)
        processed = set()
        merged_matches = []
        for i, match in enumerate(matches):
            if i in processed:
                continue
            # Flood-fill every match connected to this one via shared blocks.
            overlapping = {i}
            to_check = [i]
            while to_check:
                current = to_check.pop()
                processed.add(current)
                for block in matches[current].blocks:
                    for match_idx in block_to_matches[id(block)]:
                        if match_idx not in overlapping:
                            overlapping.add(match_idx)
                            to_check.append(match_idx)
            if len(overlapping) == 1:
                # No overlaps: keep the original match untouched.
                merged_matches.append(match)
                continue
            # Merge overlapping matches into one cluster.
            all_blocks = []
            similarities = []
            complexity_scores = []
            for idx in overlapping:
                all_blocks.extend(matches[idx].blocks)
                similarities.append(matches[idx].similarity_score)
                complexity_scores.append(matches[idx].complexity_score)
            # Remove duplicate blocks (keyed by file path and line range).
            unique_blocks = []
            seen_blocks = set()
            for block in all_blocks:
                block_id = (block.file_path, block.start_line, block.end_line)
                if block_id not in seen_blocks:
                    unique_blocks.append(block)
                    seen_blocks.add(block_id)
            avg_score = sum(similarities) / len(similarities)
            merged_match = DuplicateMatch(
                blocks=unique_blocks,
                similarity_score=avg_score,
                match_type="merged_cluster",
                description=f"Merged cluster with {len(unique_blocks)} blocks (avg similarity: {avg_score:.3f})",
                complexity_score=max(complexity_scores)
                if complexity_scores
                else 0.0,
                priority_score=avg_score,
            )
            merged_matches.append(merged_match)
        return merged_matches

    def _get_confidence_level(self, confidence: float) -> str:
        """Map a numeric confidence to a human-readable label."""
        if confidence >= 0.8:
            return "High"
        if confidence >= 0.6:
            return "Medium"
        if confidence >= 0.4:
            return "Low"
        return "Very Low"

View File

@@ -0,0 +1,63 @@
"""Similarity algorithms for code analysis."""
from .base import (
BaseSimilarityAlgorithm,
SimilarityCalculator,
)
from .lsh import (
BandingLSH,
LSHDuplicateDetector,
LSHSimilarity,
)
from .semantic import (
FunctionalSimilarity,
HashSimilarity,
SemanticSimilarity,
)
from .structural import (
DependencySimilarity,
IdentifierSimilarity,
StructuralSimilarity,
TreeEditDistance,
)
from .text_based import (
DifflibSimilarity,
LevenshteinSimilarity,
LongestCommonSubsequence,
NGramSimilarity,
)
from .token_based import (
CosineSimilarity,
JaccardSimilarity,
ShingleSimilarity,
TFIDFSimilarity,
)
# Public API of the similarity package, grouped by algorithm family.
__all__ = [
    # Base classes
    "BaseSimilarityAlgorithm",
    "SimilarityCalculator",
    # Text-based algorithms
    "LevenshteinSimilarity",
    "DifflibSimilarity",
    "LongestCommonSubsequence",
    "NGramSimilarity",
    # Token-based algorithms
    "JaccardSimilarity",
    "CosineSimilarity",
    "TFIDFSimilarity",
    "ShingleSimilarity",
    # Structural algorithms
    "StructuralSimilarity",
    "TreeEditDistance",
    "DependencySimilarity",
    "IdentifierSimilarity",
    # Semantic algorithms
    "SemanticSimilarity",
    "FunctionalSimilarity",
    "HashSimilarity",
    # LSH algorithms
    "LSHSimilarity",
    "LSHDuplicateDetector",
    "BandingLSH",
]

View File

@@ -0,0 +1,130 @@
"""Base similarity calculation framework."""
from abc import ABC, abstractmethod
from typing import Any
from ..config.schemas import SimilarityAlgorithmConfig
from ..core.base import CodeBlock
class BaseSimilarityAlgorithm(ABC):
    """Abstract interface for pairwise text-similarity algorithms.

    Concrete subclasses implement :meth:`calculate`; name, weight and the
    enabled flag are read from the attached ``SimilarityAlgorithmConfig``.
    """

    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        # Default config is named after the concrete class (lowercased).
        self.config = (
            config
            if config
            else SimilarityAlgorithmConfig(name=self.__class__.__name__.lower())
        )

    @abstractmethod
    def calculate(self, text1: str, text2: str) -> float:
        """Return a similarity score for the two strings."""
        ...

    @property
    def name(self) -> str:
        """Configured algorithm name."""
        return self.config.name

    @property
    def weight(self) -> float:
        """Relative weight used when combining algorithms."""
        return self.config.weight

    @property
    def enabled(self) -> bool:
        """Whether this algorithm participates in scoring."""
        return self.config.enabled
class SimilarityCalculator:
    """Main similarity calculator that combines multiple algorithms."""

    def __init__(self, algorithms: list[BaseSimilarityAlgorithm] | None = None):
        """Start with *algorithms* (possibly empty) and normalize weights."""
        self.algorithms = algorithms or []
        self._normalize_weights()

    def add_algorithm(self, algorithm: BaseSimilarityAlgorithm) -> None:
        """Register *algorithm* and re-normalize all enabled weights."""
        self.algorithms.append(algorithm)
        self._normalize_weights()

    def calculate_similarity(self, block1: CodeBlock, block2: CodeBlock) -> float:
        """Weighted average of enabled algorithms' scores for the two blocks."""
        if not self.algorithms:
            return 0.0
        score_sum = 0.0
        weight_sum = 0.0
        for alg in self.algorithms:
            if not alg.enabled:
                continue
            try:
                value = alg.calculate(
                    block1.normalized_content, block2.normalized_content
                )
                score_sum += value * alg.weight
                weight_sum += alg.weight
            except Exception:
                # A failing algorithm simply contributes nothing.
                continue
        if weight_sum > 0:
            return score_sum / weight_sum
        return 0.0

    def calculate_detailed_similarity(
        self, block1: CodeBlock, block2: CodeBlock
    ) -> dict[str, float]:
        """Per-algorithm scores plus their weighted average.

        Algorithms that raise are reported with a score of 0.0; the result
        also contains a ``weighted_average`` entry.
        """
        breakdown: dict[str, float] = {}
        for alg in self.algorithms:
            if not alg.enabled:
                continue
            try:
                breakdown[alg.name] = alg.calculate(
                    block1.normalized_content, block2.normalized_content
                )
            except Exception:
                breakdown[alg.name] = 0.0
        # Weighted average over all enabled algorithms.
        numerator = sum(
            breakdown[alg.name] * alg.weight
            for alg in self.algorithms
            if alg.enabled and alg.name in breakdown
        )
        denominator = sum(alg.weight for alg in self.algorithms if alg.enabled)
        breakdown["weighted_average"] = (
            numerator / denominator if denominator > 0 else 0.0
        )
        return breakdown

    def _normalize_weights(self) -> None:
        """Rescale enabled algorithms' weights in place to sum to 1.0."""
        active = [alg for alg in self.algorithms if alg.enabled]
        if not active:
            return
        combined = sum(alg.weight for alg in active)
        if combined > 0:
            for alg in active:
                alg.config.weight = alg.weight / combined

    def get_algorithm_info(self) -> list[dict[str, Any]]:
        """Describe every registered algorithm (name, weight, enabled, class)."""
        info = []
        for alg in self.algorithms:
            info.append(
                {
                    "name": alg.name,
                    "weight": alg.weight,
                    "enabled": alg.enabled,
                    "class": alg.__class__.__name__,
                }
            )
        return info

View File

@@ -0,0 +1,326 @@
"""LSH-based similarity for efficient large-scale duplicate detection."""
import hashlib
from collections import defaultdict
from typing import Any
# Optional dependency: datasketch provides MinHash/MinHashLSH. When it is
# missing, the LSH classes below fall back to pure-Python behavior (or no-op).
try:
    from datasketch import MinHash, MinHashLSH
    LSH_AVAILABLE = True
except ImportError:
    LSH_AVAILABLE = False
from ..config.schemas import SimilarityAlgorithmConfig
from ..core.base import CodeBlock
from .base import BaseSimilarityAlgorithm
class LSHSimilarity(BaseSimilarityAlgorithm):
    """MinHash-based similarity for efficient approximate matching.

    Scores are Jaccard estimates over character 4-shingles. When the
    optional ``datasketch`` package is missing, an exact shingle-Jaccard
    fallback is used instead.
    """
    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        if config is None:
            config = SimilarityAlgorithmConfig(
                name="lsh",
                weight=0.1,
                parameters={"threshold": 0.8, "num_perm": 128, "bands": 16, "rows": 8},
            )
        super().__init__(config)
        # LSH tuning knobs pulled from the config parameters.
        params = self.config.parameters
        self.threshold = params.get("threshold", 0.8)
        self.num_perm = params.get("num_perm", 128)
        self.bands = params.get("bands", 16)
        self.rows = params.get("rows", 8)
        # Index state kept for API compatibility; note that calculate()
        # compares two MinHashes directly and never queries this index.
        self.lsh_index = None
        self.minhashes = {}
        if LSH_AVAILABLE:
            self._initialize_lsh()
    def _initialize_lsh(self) -> None:
        """Create the MinHashLSH index when datasketch is installed."""
        if LSH_AVAILABLE:
            self.lsh_index = MinHashLSH(
                threshold=self.threshold, num_perm=self.num_perm
            )
    def calculate(self, text1: str, text2: str) -> float:
        """Estimate the Jaccard similarity of the two texts via MinHash."""
        if not LSH_AVAILABLE:
            # No datasketch: use the exact shingle-based fallback.
            return self._fallback_similarity(text1, text2)
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0
        first = self._create_minhash(text1)
        second = self._create_minhash(text2)
        return first.jaccard(second)
    def _create_minhash(self, text: str) -> Any:
        """Build a MinHash over the character shingles of *text*."""
        if not LSH_AVAILABLE:
            return None
        minhash = MinHash(num_perm=self.num_perm)
        for shingle in self._get_shingles(text):
            minhash.update(shingle.encode("utf-8"))
        return minhash
    def _get_shingles(self, text: str, k: int = 4) -> set[str]:
        """Generate character k-shingles from lowercased, whitespace-free text."""
        # Drop spaces, newlines and tabs in one translate pass.
        compact = text.lower().translate(str.maketrans("", "", " \n\t"))
        if len(compact) < k:
            return {compact}
        return {compact[pos : pos + k] for pos in range(len(compact) - k + 1)}
    def _fallback_similarity(self, text1: str, text2: str) -> float:
        """Exact Jaccard similarity over character 4-grams (no datasketch)."""
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0
        grams1 = self._get_shingles(text1)
        grams2 = self._get_shingles(text2)
        if not grams1 and not grams2:
            return 1.0
        if not grams1 or not grams2:
            return 0.0
        overlap = len(grams1 & grams2)
        combined = len(grams1 | grams2)
        return overlap / combined if combined > 0 else 0.0
class LSHDuplicateDetector:
    """High-performance duplicate detection using LSH.

    Maintains a MinHashLSH index over added code blocks; every public method
    degrades to a no-op / empty result when datasketch is not installed.
    """
    def __init__(
        self,
        threshold: float = 0.8,
        num_perm: int = 128,
        bands: int = 16,
        rows: int = 8,
    ):
        # NOTE(review): bands/rows are stored but not passed to MinHashLSH
        # below — confirm whether they are meant to configure the index.
        self.threshold = threshold
        self.num_perm = num_perm
        self.bands = bands
        self.rows = rows
        self.lsh_index = None
        # block_id -> MinHash, and block_id -> original CodeBlock.
        self.minhashes = {}
        self.code_blocks = {}
        if LSH_AVAILABLE:
            self.lsh_index = MinHashLSH(threshold=threshold, num_perm=num_perm)
    def add_code_block(self, block: CodeBlock) -> None:
        """Add a code block to the LSH index (no-op without datasketch)."""
        if not LSH_AVAILABLE:
            return
        block_id = self._get_block_id(block)
        minhash = self._create_minhash(block.normalized_content)
        self.minhashes[block_id] = minhash
        self.code_blocks[block_id] = block
        if self.lsh_index:
            self.lsh_index.insert(block_id, minhash)
    def find_similar_blocks(self, block: CodeBlock) -> list[tuple[CodeBlock, float]]:
        """Find similar blocks using LSH.

        Queries the index for candidates, re-scores each candidate with an
        exact MinHash Jaccard estimate, and returns ``(block, similarity)``
        pairs above the threshold, sorted by similarity descending.
        """
        if not LSH_AVAILABLE or not self.lsh_index:
            return []
        block_id = self._get_block_id(block)
        query_minhash = self._create_minhash(block.normalized_content)
        # Get candidate similar blocks
        candidates = self.lsh_index.query(query_minhash)
        similar_blocks = []
        for candidate_id in candidates:
            if candidate_id == block_id:
                continue
            candidate_block = self.code_blocks.get(candidate_id)
            if candidate_block:
                # Calculate exact similarity
                similarity = query_minhash.jaccard(self.minhashes[candidate_id])
                if similarity >= self.threshold:
                    similar_blocks.append((candidate_block, similarity))
        # Sort by similarity descending
        similar_blocks.sort(key=lambda x: x[1], reverse=True)
        return similar_blocks
    def find_all_duplicates(self) -> list[list[CodeBlock]]:
        """Find all duplicate groups using LSH.

        Greedy grouping: each unprocessed block seeds a group with its
        similar neighbors, and all members are marked processed so a block
        appears in at most one group.
        """
        if not LSH_AVAILABLE or not self.lsh_index:
            return []
        duplicate_groups = []
        processed = set()
        for block_id, block in self.code_blocks.items():
            if block_id in processed:
                continue
            similar_blocks = self.find_similar_blocks(block)
            if similar_blocks:
                # Create group with original block and similar blocks
                group = [block]
                group.extend([similar_block for similar_block, _ in similar_blocks])
                # Mark all blocks in group as processed
                processed.add(block_id)
                for similar_block, _ in similar_blocks:
                    similar_id = self._get_block_id(similar_block)
                    processed.add(similar_id)
                duplicate_groups.append(group)
        return duplicate_groups
    def get_statistics(self) -> dict[str, Any]:
        """Get LSH index statistics (or an error entry without datasketch)."""
        if not LSH_AVAILABLE or not self.lsh_index:
            return {"error": "LSH not available"}
        return {
            "total_blocks": len(self.code_blocks),
            "threshold": self.threshold,
            "num_perm": self.num_perm,
            "lsh_available": LSH_AVAILABLE,
            "index_keys": len(self.lsh_index.keys)
            if hasattr(self.lsh_index, "keys")
            else 0,
        }
    def _create_minhash(self, text: str) -> Any:
        """Create MinHash for text from token 3-shingles."""
        if not LSH_AVAILABLE:
            return None
        minhash = MinHash(num_perm=self.num_perm)
        # Create token-based shingles
        shingles = self._get_token_shingles(text)
        for shingle in shingles:
            minhash.update(shingle.encode("utf-8"))
        return minhash
    def _get_token_shingles(self, text: str, k: int = 3) -> set[str]:
        """Generate token k-shingles (space-joined word runs) from text."""
        import re
        # Tokenize text
        tokens = re.findall(r"\w+", text.lower())
        if len(tokens) < k:
            return {" ".join(tokens)}
        return {" ".join(tokens[i : i + k]) for i in range(len(tokens) - k + 1)}
    def _get_block_id(self, block: CodeBlock) -> str:
        """Generate unique ID for code block.

        md5 is used purely as a non-cryptographic fingerprint of the block's
        file path and line range.
        """
        content = f"{block.file_path}:{block.start_line}:{block.end_line}"
        return hashlib.md5(content.encode()).hexdigest()
class BandingLSH:
"""Custom LSH implementation with banding technique."""
def __init__(self, bands: int = 20, rows: int = 5, threshold: float = 0.8):
self.bands = bands
self.rows = rows
self.threshold = threshold
self.hash_tables: list[defaultdict[int, set[str]]] = [
defaultdict(set) for _ in range(bands)
]
self.signatures: dict[str, list[int]] = {}
def add_signature(self, item_id: str, signature: list[int]) -> None:
"""Add signature to LSH buckets."""
if len(signature) != self.bands * self.rows:
raise ValueError(
f"Signature length {len(signature)} != {self.bands * self.rows}"
)
self.signatures[item_id] = signature
# Hash each band
for band_idx in range(self.bands):
start = band_idx * self.rows
end = start + self.rows
band_signature = tuple(signature[start:end])
# Hash the band
band_hash = hash(band_signature)
self.hash_tables[band_idx][band_hash].add(item_id)
def find_candidates(self, query_id: str) -> set[str]:
"""Find candidate similar items."""
if query_id not in self.signatures:
return set()
candidates = set()
query_signature = self.signatures[query_id]
# Check each band
for band_idx in range(self.bands):
start = band_idx * self.rows
end = start + self.rows
band_signature = tuple(query_signature[start:end])
band_hash = hash(band_signature)
candidates.update(self.hash_tables[band_idx][band_hash])
# Remove query item itself
candidates.discard(query_id)
return candidates
def estimate_jaccard(self, sig1: list[int], sig2: list[int]) -> float:
"""Estimate Jaccard similarity from signatures."""
if len(sig1) != len(sig2):
return 0.0
matches = sum(1 for a, b in zip(sig1, sig2, strict=False) if a == b)
return matches / len(sig1)
def get_statistics(self) -> dict[str, Any]:
"""Get LSH statistics."""
total_buckets = sum(len(table) for table in self.hash_tables)
avg_bucket_size = total_buckets / self.bands if self.bands > 0 else 0
return {
"bands": self.bands,
"rows": self.rows,
"total_items": len(self.signatures),
"total_buckets": total_buckets,
"avg_bucket_size": avg_bucket_size,
"threshold": self.threshold,
}

View File

@@ -0,0 +1,398 @@
"""Semantic similarity algorithms for code analysis."""
import ast
import hashlib
import re
from collections import Counter
from ..config.schemas import SimilarityAlgorithmConfig
from .base import BaseSimilarityAlgorithm
class SemanticSimilarity(BaseSimilarityAlgorithm):
"""Semantic similarity algorithm based on normalized code patterns."""
def __init__(self, config: SimilarityAlgorithmConfig | None = None):
if config is None:
config = SimilarityAlgorithmConfig(name="semantic", weight=0.2)
super().__init__(config)
def calculate(self, text1: str, text2: str) -> float:
"""Calculate similarity based on semantic patterns."""
if not text1 and not text2:
return 1.0
if not text1 or not text2:
return 0.0
# Normalize both texts for semantic comparison
normalized1 = self._normalize_code(text1)
normalized2 = self._normalize_code(text2)
# Calculate multiple semantic similarities
pattern_sim = self._pattern_similarity(normalized1, normalized2)
concept_sim = self._concept_similarity(text1, text2)
structure_sim = self._semantic_structure_similarity(text1, text2)
# Weighted combination
return pattern_sim * 0.4 + concept_sim * 0.4 + structure_sim * 0.2
def _normalize_code(self, code: str) -> str:
"""Normalize code for semantic comparison."""
# Remove comments
code = re.sub(r"#.*$", "", code, flags=re.MULTILINE)
code = re.sub(r'""".*?"""', "", code, flags=re.DOTALL)
code = re.sub(r"'''.*?'''", "", code, flags=re.DOTALL)
# Normalize whitespace
code = re.sub(r"\s+", " ", code).strip()
# Normalize variable names to generic patterns
code = re.sub(r"\b[a-z_][a-z0-9_]*\b", "VAR", code)
# Normalize string literals
code = re.sub(r'"[^"]*"', "STR", code)
code = re.sub(r"'[^']*'", "STR", code)
# Normalize numbers
code = re.sub(r"\b\d+\.?\d*\b", "NUM", code)
return code
def _pattern_similarity(self, normalized1: str, normalized2: str) -> float:
"""Compare normalized code patterns."""
if not normalized1 and not normalized2:
return 1.0
if not normalized1 or not normalized2:
return 0.0
import difflib
return difflib.SequenceMatcher(None, normalized1, normalized2).ratio()
def _concept_similarity(self, code1: str, code2: str) -> float:
"""Compare conceptual similarity using keywords and operations."""
concepts1 = self._extract_concepts(code1)
concepts2 = self._extract_concepts(code2)
if not concepts1 and not concepts2:
return 1.0
if not concepts1 or not concepts2:
return 0.0
# Calculate cosine similarity on concept frequencies
all_concepts = set(concepts1.keys()) | set(concepts2.keys())
dot_product = sum(
concepts1.get(concept, 0) * concepts2.get(concept, 0)
for concept in all_concepts
)
magnitude1 = (
sum(concepts1.get(concept, 0) ** 2 for concept in all_concepts) ** 0.5
)
magnitude2 = (
sum(concepts2.get(concept, 0) ** 2 for concept in all_concepts) ** 0.5
)
if magnitude1 == 0 or magnitude2 == 0:
return 0.0
return dot_product / (magnitude1 * magnitude2)
def _extract_concepts(self, code: str) -> Counter[str]:
"""Extract conceptual elements from code."""
concepts = Counter()
# Python keywords and operations
python_concepts = {
"def",
"class",
"if",
"else",
"elif",
"for",
"while",
"try",
"except",
"finally",
"with",
"return",
"yield",
"import",
"from",
"as",
"and",
"or",
"not",
"in",
"is",
"lambda",
"pass",
"break",
"continue",
}
# Extract words
words = re.findall(r"\b\w+\b", code.lower())
for word in words:
if word in python_concepts:
concepts[f"keyword:{word}"] += 1
elif word in ["len", "str", "int", "float", "list", "dict", "set", "tuple"]:
concepts[f"builtin:{word}"] += 1
elif word.endswith("error") or word.endswith("exception"):
concepts["error_handling"] += 1
elif word in ["print", "log", "debug", "info", "warn", "error"]:
concepts["logging"] += 1
elif word in ["open", "read", "write", "close", "file"]:
concepts["file_io"] += 1
elif word in ["get", "post", "put", "delete", "request", "response"]:
concepts["http"] += 1
elif word in ["query", "select", "insert", "update", "delete", "database"]:
concepts["database"] += 1
# Extract operators and patterns
operators = re.findall(r"[+\-*/=<>!&|^~%]", code)
for op in operators:
concepts[f"operator:{op}"] += 1
return concepts
def _semantic_structure_similarity(self, code1: str, code2: str) -> float:
    """Similarity of semantic AST patterns; 0.0 when either snippet is unparseable."""
    import ast

    try:
        parsed_a = ast.parse(code1)
        parsed_b = ast.parse(code2)
    except SyntaxError:
        return 0.0
    return self._compare_pattern_sets(
        self._extract_semantic_patterns(parsed_a),
        self._extract_semantic_patterns(parsed_b),
    )
def _extract_semantic_patterns(self, tree: ast.AST) -> set[str]:
"""Extract semantic patterns from AST."""
patterns = set()
for node in ast.walk(tree):
if isinstance(node, ast.FunctionDef):
# Function signature patterns
arg_count = len(node.args.args)
patterns.add(f"function_args:{arg_count}")
# Check for common patterns
if any(isinstance(n, ast.Return) for n in ast.walk(node)):
patterns.add("function_returns")
if any(isinstance(n, ast.Yield) for n in ast.walk(node)):
patterns.add("generator_function")
elif isinstance(node, ast.ClassDef):
# Class patterns
base_count = len(node.bases)
patterns.add(f"class_inheritance:{base_count}")
elif isinstance(node, ast.Try):
# Exception handling patterns
patterns.add("exception_handling")
if node.finalbody:
patterns.add("finally_block")
elif isinstance(node, ast.With):
# Context manager pattern
patterns.add("context_manager")
elif isinstance(node, ast.ListComp):
patterns.add("list_comprehension")
elif isinstance(node, ast.DictComp):
patterns.add("dict_comprehension")
elif isinstance(node, ast.SetComp):
patterns.add("set_comprehension")
elif isinstance(node, ast.Lambda):
patterns.add("lambda_function")
elif isinstance(
node, (ast.FunctionDef, ast.ClassDef, ast.AsyncFunctionDef)
):
if node.decorator_list:
patterns.add("decorator_usage")
return patterns
def _compare_pattern_sets(self, patterns1: set[str], patterns2: set[str]) -> float:
"""Compare two sets of semantic patterns."""
if not patterns1 and not patterns2:
return 1.0
if not patterns1 or not patterns2:
return 0.0
intersection = len(patterns1.intersection(patterns2))
union = len(patterns1.union(patterns2))
return intersection / union if union > 0 else 0.0
class FunctionalSimilarity(BaseSimilarityAlgorithm):
    """Similarity based on functional behavior patterns.

    Profiles each snippet by counting broad behavioral categories in its
    AST and compares the two profiles with cosine similarity.
    """

    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        effective = (
            SimilarityAlgorithmConfig(name="functional", weight=0.15)
            if config is None
            else config
        )
        super().__init__(effective)

    def calculate(self, text1: str, text2: str) -> float:
        """Calculate similarity based on functional patterns."""
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0
        try:
            import ast

            profile_a = self._extract_behavioral_patterns(ast.parse(text1))
            profile_b = self._extract_behavioral_patterns(ast.parse(text2))
        except SyntaxError:
            return 0.0
        return self._compare_behaviors(profile_a, profile_b)

    def _extract_behavioral_patterns(self, tree: ast.AST) -> dict[str, int]:
        """Count behavioral categories (access, mutation, control flow, ...) in the AST."""
        counts = dict.fromkeys(
            (
                "data_access",        # Reading/accessing data
                "data_mutation",      # Modifying data
                "control_flow",       # Conditional logic
                "iteration",          # Loops and iteration
                "function_calls",     # Function invocations
                "exception_handling", # Error handling
                "io_operations",      # Input/output
                "mathematical",       # Math operations
            ),
            0,
        )
        io_call_names = {"print", "input", "open", "read", "write"}
        math_operators = (ast.Add, ast.Sub, ast.Mult, ast.Div, ast.Mod, ast.Pow)
        for node in ast.walk(tree):
            if isinstance(node, (ast.Subscript, ast.Attribute)):
                counts["data_access"] += 1
            elif isinstance(node, (ast.Assign, ast.AugAssign)):
                counts["data_mutation"] += 1
            elif isinstance(node, ast.If):
                counts["control_flow"] += 1
            elif isinstance(node, (ast.For, ast.While)):
                counts["iteration"] += 1
            elif isinstance(node, ast.Call):
                counts["function_calls"] += 1
                # Direct-name calls to well-known I/O routines count twice
                if isinstance(node.func, ast.Name) and node.func.id.lower() in io_call_names:
                    counts["io_operations"] += 1
            elif isinstance(node, ast.Try):
                counts["exception_handling"] += 1
            elif isinstance(node, ast.BinOp) and isinstance(node.op, math_operators):
                counts["mathematical"] += 1
        return counts

    def _compare_behaviors(
        self, behavior1: dict[str, int], behavior2: dict[str, int]
    ) -> float:
        """Cosine similarity between two behavior-count profiles."""
        if not any(behavior1.values()) and not any(behavior2.values()):
            return 1.0
        if not any(behavior1.values()) or not any(behavior2.values()):
            return 0.0
        categories = set(behavior1) | set(behavior2)
        dot = sum(behavior1.get(cat, 0) * behavior2.get(cat, 0) for cat in categories)
        norm_a = sum(behavior1.get(cat, 0) ** 2 for cat in categories) ** 0.5
        norm_b = sum(behavior2.get(cat, 0) ** 2 for cat in categories) ** 0.5
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return dot / (norm_a * norm_b)
class HashSimilarity(BaseSimilarityAlgorithm):
    """Similarity based on code content hashing.

    Tries exact, whitespace/comment-normalized, and fuzzy n-gram hash
    comparisons; each returns a binary 1.0/0.0 verdict.
    """

    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        super().__init__(
            SimilarityAlgorithmConfig(name="hash", weight=0.1)
            if config is None
            else config
        )

    def calculate(self, text1: str, text2: str) -> float:
        """Calculate similarity using various hash comparisons."""
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0
        exact = self._exact_hash_similarity(text1, text2)
        if exact == 1.0:
            return 1.0
        # Otherwise take the best of the looser comparisons.
        return max(
            exact,
            self._normalized_hash_similarity(text1, text2),
            self._fuzzy_hash_similarity(text1, text2),
        )

    def _exact_hash_similarity(self, text1: str, text2: str) -> float:
        """1.0 when the raw contents hash identically, else 0.0."""
        digest_a = hashlib.md5(text1.encode()).hexdigest()
        digest_b = hashlib.md5(text2.encode()).hexdigest()
        return 1.0 if digest_a == digest_b else 0.0

    def _normalized_hash_similarity(self, text1: str, text2: str) -> float:
        """1.0 when contents match after stripping comments and collapsing whitespace."""

        def canonical(src: str) -> str:
            without_comments = re.sub(r"#.*$", "", src, flags=re.MULTILINE)
            return re.sub(r"\s+", " ", without_comments).strip()

        digest_a = hashlib.md5(canonical(text1).encode()).hexdigest()
        digest_b = hashlib.md5(canonical(text2).encode()).hexdigest()
        return 1.0 if digest_a == digest_b else 0.0

    def _fuzzy_hash_similarity(self, text1: str, text2: str) -> float:
        """1.0 when the character 4-gram Jaccard similarity exceeds 0.95."""
        grams_a = {text1[i : i + 4] for i in range(len(text1) - 3)}
        grams_b = {text2[i : i + 4] for i in range(len(text2) - 3)}
        if not grams_a and not grams_b:
            return 1.0
        if not grams_a or not grams_b:
            return 0.0
        shared = len(grams_a & grams_b)
        combined = len(grams_a | grams_b)
        jaccard = shared / combined if combined > 0 else 0.0
        # Return 1.0 only for very high similarity
        return 1.0 if jaccard > 0.95 else 0.0

View File

@@ -0,0 +1,399 @@
"""Structural similarity algorithms for code analysis."""
import ast
from collections import Counter
from ..config.schemas import SimilarityAlgorithmConfig
from .base import BaseSimilarityAlgorithm
class StructuralSimilarity(BaseSimilarityAlgorithm):
    """AST-based structural similarity algorithm.

    Compares two snippets by the shape of their ASTs: definition
    signatures, control-flow layout, and nesting depth.
    """

    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        if config is None:
            config = SimilarityAlgorithmConfig(name="structural", weight=0.25)
        super().__init__(config)

    def calculate(self, text1: str, text2: str) -> float:
        """Calculate similarity based on AST structure.

        Falls back to a plain text ratio when either snippet fails to parse.
        """
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0
        try:
            tree1 = ast.parse(text1)
            tree2 = ast.parse(text2)
        except SyntaxError:
            # Fallback to text-based comparison for malformed code
            return self._fallback_similarity(text1, text2)
        structure1 = self._extract_structure(tree1)
        structure2 = self._extract_structure(tree2)
        return self._compare_structures(structure1, structure2)

    @staticmethod
    def _contains_nested_loop(loop: ast.AST) -> bool:
        """True when *loop* has another For/While anywhere inside it.

        BUGFIX: ast.walk() yields the root node first, so it must be
        excluded — the previous check counted the loop itself and reported
        every single loop as nested.
        """
        return any(
            isinstance(inner, (ast.For, ast.While))
            for inner in ast.walk(loop)
            if inner is not loop
        )

    def _extract_structure(self, tree: ast.AST) -> list[str]:
        """Extract enhanced structural patterns from AST."""
        structure: list[str] = []

        def visit_node(node: ast.AST, depth: int = 0) -> None:
            # Tag entries with nesting depth so deeply nested constructs
            # only match other constructs at a similar depth.
            depth_prefix = f"d{depth}:" if depth > 0 else ""
            if isinstance(node, ast.FunctionDef):
                # Abstract function names but keep structural information
                arg_count = len(node.args.args)
                has_decorators = len(node.decorator_list) > 0
                structure.append(
                    f"{depth_prefix}function:args{arg_count}:dec{has_decorators}"
                )
                # Record the sequence of top-level statement kinds (max 5)
                body_patterns = []
                for child in node.body:
                    if isinstance(child, ast.If):
                        body_patterns.append("if")
                    elif isinstance(child, ast.For):
                        body_patterns.append("for")
                    elif isinstance(child, ast.While):
                        body_patterns.append("while")
                    elif isinstance(child, ast.Try):
                        body_patterns.append("try")
                    elif isinstance(child, ast.Return):
                        body_patterns.append("return")
                if body_patterns:
                    structure.append(
                        f"{depth_prefix}body_pattern:{'_'.join(body_patterns[:5])}"
                    )
                for child in ast.iter_child_nodes(node):
                    visit_node(child, depth + 1)
            elif isinstance(node, ast.AsyncFunctionDef):
                arg_count = len(node.args.args)
                has_decorators = len(node.decorator_list) > 0
                structure.append(
                    f"{depth_prefix}async_function:args{arg_count}:dec{has_decorators}"
                )
                for child in ast.iter_child_nodes(node):
                    visit_node(child, depth + 1)
            elif isinstance(node, ast.ClassDef):
                # Abstract class names but keep inheritance and structure info
                base_count = len(node.bases)
                has_decorators = len(node.decorator_list) > 0
                structure.append(
                    f"{depth_prefix}class:bases{base_count}:dec{has_decorators}"
                )
                # Count methods in class
                method_count = sum(
                    1
                    for child in node.body
                    if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef))
                )
                structure.append(f"{depth_prefix}class_methods:{method_count}")
                for child in ast.iter_child_nodes(node):
                    visit_node(child, depth + 1)
            elif isinstance(node, ast.If):
                # Track conditional structure complexity
                elif_count = len([n for n in node.orelse if isinstance(n, ast.If)])
                has_else = any(not isinstance(n, ast.If) for n in node.orelse)
                structure.append(f"{depth_prefix}if:elif{elif_count}:else{has_else}")
                for child in ast.iter_child_nodes(node):
                    visit_node(child, depth + 1)
            elif isinstance(node, ast.For):
                is_nested = self._contains_nested_loop(node)
                structure.append(f"{depth_prefix}for:nested{is_nested}")
                for child in ast.iter_child_nodes(node):
                    visit_node(child, depth + 1)
            elif isinstance(node, ast.While):
                is_nested = self._contains_nested_loop(node)
                structure.append(f"{depth_prefix}while:nested{is_nested}")
                for child in ast.iter_child_nodes(node):
                    visit_node(child, depth + 1)
            elif isinstance(node, ast.Try):
                except_count = len(node.handlers)
                has_finally = bool(node.finalbody)
                has_else = bool(node.orelse)
                structure.append(
                    f"{depth_prefix}try:except{except_count}:finally{has_finally}:else{has_else}"
                )
                for child in ast.iter_child_nodes(node):
                    visit_node(child, depth + 1)
            elif isinstance(node, ast.With):
                item_count = len(node.items)
                structure.append(f"{depth_prefix}with:items{item_count}")
                for child in ast.iter_child_nodes(node):
                    visit_node(child, depth + 1)
            elif isinstance(node, ast.Return):
                has_value = node.value is not None
                structure.append(f"{depth_prefix}return:value{has_value}")
            elif isinstance(node, ast.Assign):
                target_count = len(node.targets)
                structure.append(f"{depth_prefix}assign:targets{target_count}")
            elif isinstance(node, ast.Call):
                # Abstract function calls but keep argument structure
                arg_count = len(node.args)
                kwarg_count = len(node.keywords)
                structure.append(
                    f"{depth_prefix}call:args{arg_count}:kwargs{kwarg_count}"
                )
            else:
                # Visit other node types without adding to structure
                for child in ast.iter_child_nodes(node):
                    visit_node(child, depth)

        visit_node(tree)
        return structure

    def _compare_structures(
        self, structure1: list[str], structure2: list[str]
    ) -> float:
        """Blend set-overlap (Jaccard) and order-aware (LCS) similarity."""
        if not structure1 and not structure2:
            return 1.0
        if not structure1 or not structure2:
            return 0.0
        # Jaccard on the unordered pattern sets
        set1 = set(structure1)
        set2 = set(structure2)
        intersection = len(set1.intersection(set2))
        union = len(set1.union(set2))
        jaccard = intersection / union if union > 0 else 0.0
        # Also consider sequence similarity
        sequence_sim = self._sequence_similarity(structure1, structure2)
        return (jaccard + sequence_sim) / 2

    def _sequence_similarity(self, seq1: list[str], seq2: list[str]) -> float:
        """LCS length divided by the longer sequence's length."""
        if not seq1 and not seq2:
            return 1.0
        if not seq1 or not seq2:
            return 0.0
        lcs_length = self._lcs_length(seq1, seq2)
        max_length = max(len(seq1), len(seq2))
        return lcs_length / max_length if max_length > 0 else 0.0

    def _lcs_length(self, seq1: list[str], seq2: list[str]) -> int:
        """Length of the longest common subsequence (O(m*n) DP)."""
        m, n = len(seq1), len(seq2)
        dp = [[0] * (n + 1) for _ in range(m + 1)]
        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if seq1[i - 1] == seq2[j - 1]:
                    dp[i][j] = dp[i - 1][j - 1] + 1
                else:
                    dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
        return dp[m][n]

    def _fallback_similarity(self, text1: str, text2: str) -> float:
        """Fallback to simple text similarity for malformed code."""
        import difflib

        return difflib.SequenceMatcher(None, text1, text2).ratio()
class TreeEditDistance(BaseSimilarityAlgorithm):
    """Tree edit distance-based similarity algorithm.

    Approximates tree edit distance with Levenshtein distance over the
    walk-ordered sequence of AST node type names.
    """

    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        if config is None:
            config = SimilarityAlgorithmConfig(name="tree_edit", weight=0.2)
        super().__init__(config)

    def calculate(self, text1: str, text2: str) -> float:
        """Calculate similarity using simplified tree edit distance."""
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0
        try:
            types_a = self._get_node_types(ast.parse(text1))
            types_b = self._get_node_types(ast.parse(text2))
        except SyntaxError:
            # Fallback to text-based comparison
            import difflib

            return difflib.SequenceMatcher(None, text1, text2).ratio()
        longest = max(len(types_a), len(types_b))
        if longest == 0:
            return 1.0
        # Normalize edit distance into a similarity score
        return 1 - (self._edit_distance(types_a, types_b) / longest)

    def _get_node_types(self, tree: ast.AST) -> list[str]:
        """Flatten the AST into a walk-ordered list of node type names."""
        return [type(node).__name__ for node in ast.walk(tree)]

    def _edit_distance(self, seq1: list[str], seq2: list[str]) -> int:
        """Levenshtein distance between two sequences (row-rolling DP)."""
        previous = list(range(len(seq2) + 1))
        for i, left in enumerate(seq1, start=1):
            current = [i]
            for j, right in enumerate(seq2, start=1):
                if left == right:
                    current.append(previous[j - 1])
                else:
                    # deletion, insertion, substitution
                    current.append(1 + min(previous[j], current[-1], previous[j - 1]))
            previous = current
        return previous[-1]
class DependencySimilarity(BaseSimilarityAlgorithm):
    """Import and dependency-based similarity algorithm."""

    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        if config is None:
            config = SimilarityAlgorithmConfig(name="dependency", weight=0.15)
        super().__init__(config)

    def calculate(self, text1: str, text2: str) -> float:
        """Calculate similarity based on imports and dependencies.

        Returns 0.0 when either snippet is unparseable. BUGFIX: previously
        _extract_dependencies swallowed the SyntaxError itself and returned
        an empty set, which left this handler unreachable and made two
        malformed (or import-free) snippets compare as 1.0.
        """
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0
        try:
            deps1 = self._extract_dependencies(text1)
            deps2 = self._extract_dependencies(text2)
        except SyntaxError:
            return 0.0
        return self._compare_dependencies(deps1, deps2)

    def _extract_dependencies(self, code: str) -> set[str]:
        """Extract import dependencies from code.

        Raises:
            SyntaxError: if *code* is not valid Python (handled by calculate).
        """
        tree = ast.parse(code)
        dependencies: set[str] = set()
        for node in ast.walk(tree):
            if isinstance(node, ast.Import):
                # Only the top-level package matters for plain imports
                for alias in node.names:
                    dependencies.add(alias.name.split(".")[0])
            elif isinstance(node, ast.ImportFrom):
                if node.module:
                    dependencies.add(node.module.split(".")[0])
                # Imported names count as dependencies too
                for alias in node.names:
                    dependencies.add(alias.name)
        return dependencies

    def _compare_dependencies(self, deps1: set[str], deps2: set[str]) -> float:
        """Jaccard similarity between two dependency sets."""
        if not deps1 and not deps2:
            return 1.0
        if not deps1 or not deps2:
            return 0.0
        intersection = len(deps1.intersection(deps2))
        union = len(deps1.union(deps2))
        return intersection / union if union > 0 else 0.0
class IdentifierSimilarity(BaseSimilarityAlgorithm):
    """Variable and function name-based similarity algorithm."""

    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        if config is None:
            config = SimilarityAlgorithmConfig(name="identifier", weight=0.2)
        super().__init__(config)

    def calculate(self, text1: str, text2: str) -> float:
        """Calculate similarity based on identifier names.

        Returns 0.0 when either snippet is unparseable. BUGFIX: previously
        _extract_identifiers swallowed the SyntaxError and returned an empty
        Counter, which left this handler unreachable and made two malformed
        snippets compare as 1.0.
        """
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0
        try:
            identifiers1 = self._extract_identifiers(text1)
            identifiers2 = self._extract_identifiers(text2)
        except SyntaxError:
            return 0.0
        return self._compare_identifiers(identifiers1, identifiers2)

    def _extract_identifiers(self, code: str) -> Counter[str]:
        """Extract all identifiers (names, def/class names, attribute names).

        Raises:
            SyntaxError: if *code* is not valid Python (handled by calculate).
        """
        tree = ast.parse(code)
        identifiers = []
        for node in ast.walk(tree):
            if isinstance(node, ast.Name):
                identifiers.append(node.id)
            elif isinstance(
                node, ast.FunctionDef | ast.AsyncFunctionDef | ast.ClassDef
            ):
                # Async function names were previously skipped; they count too.
                identifiers.append(node.name)
            elif isinstance(node, ast.Attribute):
                identifiers.append(node.attr)
        return Counter(identifiers)

    def _compare_identifiers(self, ids1: Counter[str], ids2: Counter[str]) -> float:
        """Cosine similarity over identifier frequency vectors."""
        if not ids1 and not ids2:
            return 1.0
        if not ids1 or not ids2:
            return 0.0
        all_ids = set(ids1.keys()) | set(ids2.keys())
        # Counters return 0 for missing keys, so no .get() is needed.
        dot_product = sum(ids1[id_] * ids2[id_] for id_ in all_ids)
        magnitude1 = sum(ids1[id_] ** 2 for id_ in all_ids) ** 0.5
        magnitude2 = sum(ids2[id_] ** 2 for id_ in all_ids) ** 0.5
        if magnitude1 == 0 or magnitude2 == 0:
            return 0.0
        return dot_product / (magnitude1 * magnitude2)

View File

@@ -0,0 +1,131 @@
"""Text-based similarity algorithms."""
import difflib
try:
from Levenshtein import ratio as levenshtein_ratio
LEVENSHTEIN_AVAILABLE = True
except ImportError:
LEVENSHTEIN_AVAILABLE = False
from ..config.schemas import SimilarityAlgorithmConfig
from .base import BaseSimilarityAlgorithm
class LevenshteinSimilarity(BaseSimilarityAlgorithm):
    """Levenshtein distance-based similarity algorithm.

    Uses python-Levenshtein when installed; otherwise falls back to
    difflib's SequenceMatcher ratio.
    """

    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        super().__init__(
            SimilarityAlgorithmConfig(name="levenshtein", weight=0.2)
            if config is None
            else config
        )

    def calculate(self, text1: str, text2: str) -> float:
        """Calculate similarity using Levenshtein distance."""
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0
        if not LEVENSHTEIN_AVAILABLE:
            # Fallback to difflib implementation
            return difflib.SequenceMatcher(None, text1, text2).ratio()
        return levenshtein_ratio(text1, text2)
class DifflibSimilarity(BaseSimilarityAlgorithm):
    """Python difflib-based similarity algorithm."""

    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        effective = config
        if effective is None:
            effective = SimilarityAlgorithmConfig(name="difflib", weight=0.25)
        super().__init__(effective)

    def calculate(self, text1: str, text2: str) -> float:
        """Calculate similarity using difflib's SequenceMatcher ratio."""
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0
        matcher = difflib.SequenceMatcher(None, text1, text2)
        return matcher.ratio()
class LongestCommonSubsequence(BaseSimilarityAlgorithm):
    """Longest Common Subsequence-based similarity."""

    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        if config is None:
            config = SimilarityAlgorithmConfig(name="lcs", weight=0.15)
        super().__init__(config)

    def calculate(self, text1: str, text2: str) -> float:
        """Similarity = LCS length divided by the longer text's length."""
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0
        longest = max(len(text1), len(text2))
        if longest == 0:
            return 0.0
        return self._lcs_length(text1, text2) / longest

    def _lcs_length(self, text1: str, text2: str) -> int:
        """Length of the longest common subsequence (row-rolling DP)."""
        previous = [0] * (len(text2) + 1)
        for ch_a in text1:
            current = [0]
            for j, ch_b in enumerate(text2, start=1):
                if ch_a == ch_b:
                    current.append(previous[j - 1] + 1)
                else:
                    current.append(max(previous[j], current[-1]))
            previous = current
        return previous[-1]
class NGramSimilarity(BaseSimilarityAlgorithm):
    """N-gram based similarity algorithm (Jaccard over character n-grams)."""

    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        if config is None:
            config = SimilarityAlgorithmConfig(
                name="ngram", weight=0.2, parameters={"n": 3}
            )
        super().__init__(config)
        raw_n = self.config.parameters.get("n", 3)
        # Coerce the configured n to int; fall back to trigrams on bad types.
        self.n: int = int(raw_n) if isinstance(raw_n, (int, float, str)) else 3

    def calculate(self, text1: str, text2: str) -> float:
        """Calculate similarity using n-grams."""
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0
        grams_a = set(self._get_ngrams(text1))
        grams_b = set(self._get_ngrams(text2))
        if not grams_a and not grams_b:
            return 1.0
        if not grams_a or not grams_b:
            return 0.0
        overlap = len(grams_a & grams_b)
        total = len(grams_a | grams_b)
        return overlap / total if total > 0 else 0.0

    def _get_ngrams(self, text: str) -> list[str]:
        """Generate n-grams; a too-short text yields itself as one gram."""
        if len(text) < self.n:
            return [text]
        return [text[i : i + self.n] for i in range(len(text) - self.n + 1)]

View File

@@ -0,0 +1,271 @@
"""Token-based similarity algorithms."""
import math
from collections import Counter
from ..config.schemas import SimilarityAlgorithmConfig
from .base import BaseSimilarityAlgorithm
class JaccardSimilarity(BaseSimilarityAlgorithm):
    """Jaccard similarity coefficient algorithm with enhanced tokenization."""

    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        if config is None:
            config = SimilarityAlgorithmConfig(name="jaccard", weight=0.3)
        super().__init__(config)

    def calculate(self, text1: str, text2: str) -> float:
        """Calculate similarity using Jaccard coefficient."""
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0
        tokens1 = set(self._tokenize(text1))
        tokens2 = set(self._tokenize(text2))
        if not tokens1 and not tokens2:
            return 1.0
        if not tokens1 or not tokens2:
            return 0.0
        intersection = len(tokens1.intersection(tokens2))
        union = len(tokens1.union(tokens2))
        return intersection / union if union > 0 else 0.0

    def _tokenize(self, text: str) -> list[str]:
        """Enhanced tokenization with semantic grouping for better duplicate detection.

        BUGFIXES versus the original pass:
        * single-character tokens were discarded BEFORE semantic grouping,
          so the loop indices i/j/k explicitly listed in the INDEX_VAR
          pattern could never match — grouping now runs first;
        * tokens are lowercased by text.lower(), so the 'None'/'True'/'False'
          entries in the keyword set could never match — their lowercase
          forms are included.
        """
        import re

        # Python keywords and built-ins that should be preserved exactly
        keywords = {
            'def', 'class', 'if', 'else', 'elif', 'for', 'while', 'try', 'except',
            'finally', 'with', 'as', 'import', 'from', 'return', 'yield', 'pass',
            'break', 'continue', 'and', 'or', 'not', 'in', 'is', 'lambda', 'None',
            'True', 'False', 'self', 'cls', 'len', 'range', 'str', 'int', 'float',
            'list', 'dict', 'tuple', 'set', 'bool', 'append', 'extend', 'remove',
            # lowercase forms actually produced by text.lower()
            'none', 'true', 'false',
        }
        # Semantic variable name patterns (group similar names)
        semantic_patterns = [
            (r'\b(data|item|element|val|value|obj|object|thing)\w*\b', 'DATA_VAR'),
            (r'\b(result|output|ret|return|res|response)\w*\b', 'RESULT_VAR'),
            (r'\b(index|idx|i|j|k|counter|count|num|number)\w*\b', 'INDEX_VAR'),
            (r'\b(name|id|key|identifier|label)\w*\b', 'ID_VAR'),
            (r'\b(config|settings|options|params?|args?|kwargs?)\w*\b', 'CONFIG_VAR'),
            (r'\b(path|file|dir|directory|filename)\w*\b', 'PATH_VAR'),
            (r'\b(error|err|exception|ex)\w*\b', 'ERROR_VAR'),
            (r'\b(temp|tmp|buffer|buf|cache)\w*\b', 'TEMP_VAR'),
            (r'\b(min|max|avg|sum|total|count)\w*\b', 'CALC_VAR'),
            (r'\b(user|person|client|customer)\w*\b', 'USER_VAR'),
            (r'\b(width|height|size|length|dimension)\w*\b', 'SIZE_VAR'),
        ]
        # First pass: extract all tokens
        tokens = re.findall(r"\b\w+\b", text.lower())
        # Second pass: apply semantic grouping and filtering
        processed_tokens = []
        for token in tokens:
            # Keep keywords and built-ins as-is
            if token in keywords:
                processed_tokens.append(token)
                continue
            # Apply semantic patterns to group similar variable names
            replacement = next(
                (
                    label
                    for pattern, label in semantic_patterns
                    if re.match(pattern, token)
                ),
                None,
            )
            if replacement is not None:
                processed_tokens.append(replacement)
                continue
            # Remaining single-character tokens carry no signal
            if len(token) <= 1:
                continue
            # Generic variable abstraction for remaining identifiers
            if re.match(r'^[a-zA-Z_]\w*$', token):
                processed_tokens.append('VAR')
            else:
                processed_tokens.append(token)
        return processed_tokens
class CosineSimilarity(BaseSimilarityAlgorithm):
    """Cosine similarity algorithm using TF-IDF vectors."""

    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        if config is None:
            config = SimilarityAlgorithmConfig(name="cosine", weight=0.3)
        super().__init__(config)

    def calculate(self, text1: str, text2: str) -> float:
        """Calculate similarity as the cosine of the term-count vectors."""
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0
        words_a = self._tokenize(text1)
        words_b = self._tokenize(text2)
        if not words_a and not words_b:
            return 1.0
        if not words_a or not words_b:
            return 0.0
        # Term-frequency vectors over the shared vocabulary
        counts_a = Counter(words_a)
        counts_b = Counter(words_b)
        vocabulary = set(counts_a) | set(counts_b)
        dot = sum(counts_a[word] * counts_b[word] for word in vocabulary)
        norm_a = math.sqrt(sum(counts_a[word] ** 2 for word in vocabulary))
        norm_b = math.sqrt(sum(counts_b[word] ** 2 for word in vocabulary))
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return dot / (norm_a * norm_b)

    def _tokenize(self, text: str) -> list[str]:
        """Lowercase word tokens longer than one character."""
        import re

        return [tok for tok in re.findall(r"\b\w+\b", text.lower()) if len(tok) > 1]
class TFIDFSimilarity(BaseSimilarityAlgorithm):
    """TF-IDF based cosine similarity algorithm."""

    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        if config is None:
            config = SimilarityAlgorithmConfig(name="tfidf", weight=0.25)
        super().__init__(config)

    def calculate(self, text1: str, text2: str) -> float:
        """Calculate similarity using TF-IDF weighted cosine similarity."""
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0
        tokens1 = self._tokenize(text1)
        tokens2 = self._tokenize(text2)
        if not tokens1 and not tokens2:
            return 1.0
        if not tokens1 or not tokens2:
            return 0.0
        # Calculate TF for both texts
        tf1 = self._calculate_tf(tokens1)
        tf2 = self._calculate_tf(tokens2)
        # Calculate IDF over the two-document corpus
        all_terms = set(tf1.keys()) | set(tf2.keys())
        idf = self._calculate_idf(all_terms, [tokens1, tokens2])
        # TF-IDF vectors
        tfidf1 = {term: tf1.get(term, 0) * idf[term] for term in all_terms}
        tfidf2 = {term: tf2.get(term, 0) * idf[term] for term in all_terms}
        # Cosine similarity
        dot_product = sum(tfidf1[term] * tfidf2[term] for term in all_terms)
        magnitude1 = math.sqrt(sum(tfidf1[term] ** 2 for term in all_terms))
        magnitude2 = math.sqrt(sum(tfidf2[term] ** 2 for term in all_terms))
        if magnitude1 == 0 or magnitude2 == 0:
            return 0.0
        return dot_product / (magnitude1 * magnitude2)

    def _tokenize(self, text: str) -> list[str]:
        """Lowercase word tokens longer than one character."""
        import re

        tokens = re.findall(r"\b\w+\b", text.lower())
        return [token for token in tokens if len(token) > 1]

    def _calculate_tf(self, tokens: list[str]) -> dict[str, float]:
        """Term frequency: count divided by total token count."""
        tf = Counter(tokens)
        total_terms = len(tokens)
        return {term: count / total_terms for term, count in tf.items()}

    def _calculate_idf(
        self, terms: set[str], documents: list[list[str]]
    ) -> dict[str, float]:
        """Smoothed inverse document frequency.

        BUGFIX: uses ln((1 + N) / (1 + df)) + 1 (scikit-learn's smooth_idf
        formula), which is strictly positive. The previous ln(N / (df + 1))
        was 0 for terms in exactly one of the two documents and negative for
        shared terms, so only shared terms carried weight and any two texts
        sharing a single token scored ~1.0.
        """
        idf = {}
        total_docs = len(documents)
        for term in terms:
            docs_containing_term = sum(1 for doc in documents if term in doc)
            idf[term] = math.log((1 + total_docs) / (1 + docs_containing_term)) + 1
        return idf
class ShingleSimilarity(BaseSimilarityAlgorithm):
    """Shingle-based similarity algorithm (Jaccard over k-token shingles)."""

    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        if config is None:
            config = SimilarityAlgorithmConfig(
                name="shingle", weight=0.2, parameters={"k": 4}
            )
        super().__init__(config)
        configured_k = self.config.parameters.get("k", 4)
        # Coerce the configured shingle size to int; fall back to 4.
        self.k: int = (
            int(configured_k) if isinstance(configured_k, (int, float, str)) else 4
        )

    def calculate(self, text1: str, text2: str) -> float:
        """Calculate similarity using k-shingles."""
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0
        shingle_set_a = set(self._get_shingles(text1))
        shingle_set_b = set(self._get_shingles(text2))
        if not shingle_set_a and not shingle_set_b:
            return 1.0
        if not shingle_set_a or not shingle_set_b:
            return 0.0
        shared = len(shingle_set_a & shingle_set_b)
        combined = len(shingle_set_a | shingle_set_b)
        return shared / combined if combined > 0 else 0.0

    def _get_shingles(self, text: str) -> list[str]:
        """Generate k-token shingles; short texts yield a single joined shingle."""
        tokens = self._tokenize(text)
        if len(tokens) < self.k:
            return [" ".join(tokens)]
        return [
            " ".join(tokens[i : i + self.k]) for i in range(len(tokens) - self.k + 1)
        ]

    def _tokenize(self, text: str) -> list[str]:
        """Lowercase word tokens longer than one character."""
        import re

        return [tok for tok in re.findall(r"\b\w+\b", text.lower()) if len(tok) > 1]

View File

@@ -0,0 +1 @@
"""Utility modules for the quality analysis package."""

View File

@@ -0,0 +1,222 @@
"""File discovery utilities for quality analysis."""
import fnmatch
from pathlib import Path
from typing import Any
from ..config.schemas import LanguageConfig, PathConfig
class FileFinder:
"""Finds relevant source files for analysis."""
def __init__(self, path_config: PathConfig, language_config: LanguageConfig):
    """Store the configuration objects used for file discovery."""
    # Include/exclude glob patterns and the optional max_files cap.
    self.path_config = path_config
    # Enabled languages and their recognized file extensions.
    self.language_config = language_config
def find_files(self, root_path: Path) -> list[Path]:
    """Find all relevant source files in the given path.

    Returns a single-element list when *root_path* is itself an eligible
    file, an empty list when the path does not exist, and otherwise every
    matching file from a recursive walk, capped by ``path_config.max_files``
    when that limit is set.

    FIX: removed a dead computation — the original built a set of supported
    extensions here and never used it (the extension check is delegated to
    _should_include_file via _has_supported_extension).
    """
    if not root_path.exists():
        return []
    if root_path.is_file():
        return [root_path] if self._should_include_file(root_path) else []
    found_files: list[Path] = []
    max_files = self.path_config.max_files
    # Walk the tree; include/exclude filtering (plus the supported-extension
    # check) is handled entirely by _should_include_file.
    for file_path in root_path.rglob("*"):
        if not file_path.is_file():
            continue
        # Stop once the configured file-count limit is reached
        if max_files is not None and len(found_files) >= max_files:
            break
        if self._should_include_file(file_path):
            found_files.append(file_path)
    return found_files
def find_python_files(self, root_path: Path) -> list[Path]:
"""Find only Python files."""
if not root_path.exists():
return []
if root_path.is_file():
return [root_path] if self._is_python_file(root_path) else []
found_files = []
for file_path in root_path.rglob("*.py"):
if self._should_include_file(file_path) and self._is_python_file(file_path):
found_files.append(file_path)
return found_files
def _should_include_file(self, file_path: Path) -> bool:
"""Check if a file should be included in analysis."""
path_str = str(file_path)
# Check exclude patterns first
for pattern in self.path_config.exclude_patterns:
if fnmatch.fnmatch(path_str, pattern) or fnmatch.fnmatch(
file_path.name, pattern
):
return False
# Check include patterns
for pattern in self.path_config.include_patterns:
if fnmatch.fnmatch(path_str, pattern) or fnmatch.fnmatch(
file_path.name, pattern
):
# Check if it's a supported file type
return self._has_supported_extension(file_path)
return False
def _has_supported_extension(self, file_path: Path) -> bool:
"""Check if file has a supported extension."""
suffix = file_path.suffix.lower()
for lang in self.language_config.languages:
if (
lang in self.language_config.file_extensions
and suffix in self.language_config.file_extensions[lang]
):
return True
return False
def _is_python_file(self, file_path: Path) -> bool:
"""Check if file is a Python file."""
return file_path.suffix.lower() in [".py", ".pyx", ".pyi"]
def get_file_language(self, file_path: Path) -> str | None:
"""Determine the programming language of a file."""
suffix = file_path.suffix.lower()
for lang, extensions in self.language_config.file_extensions.items():
if suffix in extensions:
return lang
return None
def get_project_stats(self, root_path: Path) -> dict[str, Any]:
"""Get statistics about files in the project."""
stats = {
"total_files": 0,
"supported_files": 0,
"excluded_files": 0,
"by_language": {},
}
if not root_path.exists():
return stats
# Initialize language counters
for lang in self.language_config.languages:
stats["by_language"][lang] = 0
# Walk through all files
for file_path in root_path.rglob("*"):
if not file_path.is_file():
continue
stats["total_files"] += 1
if self._should_include_file(file_path):
stats["supported_files"] += 1
lang = self.get_file_language(file_path)
if lang and lang in stats["by_language"]:
stats["by_language"][lang] += 1
else:
stats["excluded_files"] += 1
return stats
def filter_files_by_patterns(
self,
files: list[Path],
include_patterns: list[str] | None = None,
exclude_patterns: list[str] | None = None,
) -> list[Path]:
"""Filter files by additional patterns."""
filtered = []
for file_path in files:
path_str = str(file_path)
include = True
# Apply exclude patterns
if exclude_patterns:
for pattern in exclude_patterns:
if fnmatch.fnmatch(path_str, pattern) or fnmatch.fnmatch(
file_path.name, pattern
):
include = False
break
# Apply include patterns
if include and include_patterns:
include = False
for pattern in include_patterns:
if fnmatch.fnmatch(path_str, pattern) or fnmatch.fnmatch(
file_path.name, pattern
):
include = True
break
if include:
filtered.append(file_path)
return filtered
def get_file_size_stats(self, files: list[Path]) -> dict[str, int]:
"""Get file size statistics."""
sizes = []
total_size = 0
total_lines = 0
for file_path in files:
try:
size = file_path.stat().st_size
sizes.append(size)
total_size += size
# Count lines
with open(file_path, encoding="utf-8") as f:
lines = sum(1 for _ in f)
total_lines += lines
except (OSError, UnicodeDecodeError):
continue
if not sizes:
return {
"total_files": 0,
"total_size_bytes": 0,
"total_lines": 0,
"average_size_bytes": 0,
"average_lines_per_file": 0,
}
return {
"total_files": len(sizes),
"total_size_bytes": total_size,
"total_lines": total_lines,
"average_size_bytes": total_size // len(sizes),
"average_lines_per_file": total_lines // len(sizes),
"largest_file_bytes": max(sizes),
"smallest_file_bytes": min(sizes),
}