feat: implement code quality analysis toolkit with modernization, complexity, and duplication detection
.gitignore (vendored, Normal file, 161 lines)
@@ -0,0 +1,161 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# IDE specific files
.vscode/
.idea/
*.swp
*.swo
*~

# OS specific files
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db

# Temporary files
*.tmp
*.temp
*.bak
*.backup

# Log files
*.log

# Database files
*.db
*.sqlite
*.sqlite3

# Configuration files with secrets
.env.local
.env.production
config.local.yaml
secrets.yaml

# UV specific
.uv_cache/

# Ruff cache
.ruff_cache/

# Test artifacts
.coverage
.pytest_cache/
htmlcov/

# Build artifacts
dist/
build/
*.egg-info/
LICENSE (Normal file, 21 lines)
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Your Name

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
pyproject.toml (Normal file, 61 lines)
@@ -0,0 +1,61 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "claude-scripts"
version = "0.1.0"
description = "A comprehensive Python code quality analysis toolkit that detects duplicate code, measures complexity, and flags modernization opportunities"
authors = [{name = "Your Name", email = "your.email@example.com"}]
readme = "README.md"
license = {file = "LICENSE"}
requires-python = ">=3.12"
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Topic :: Software Development :: Quality Assurance",
    "Topic :: Software Development :: Libraries :: Python Modules",
]
keywords = ["code-quality", "static-analysis", "duplicate-detection", "complexity", "refactoring"]
dependencies = [
    "click>=8.0.0",
    "pyyaml>=6.0",
    "pydantic>=2.0.0",
    "radon>=6.0.0",
]

[project.optional-dependencies]
dev = [
    "pytest>=7.0.0",
    "pytest-cov>=4.0.0",
    "ruff>=0.1.0",
    "mypy>=1.5.0",
    "pre-commit>=3.0.0",
]

[project.urls]
Homepage = "https://github.com/yourusername/claude-scripts"
Repository = "https://github.com/yourusername/claude-scripts"
Issues = "https://github.com/yourusername/claude-scripts/issues"
Documentation = "https://github.com/yourusername/claude-scripts#readme"

[project.scripts]
claude-quality = "quality.cli.main:cli"

[tool.hatch.build.targets.sdist]
exclude = [
    "/.github",
    "/docs",
    "/.vscode",
    "/.pytest_cache",
    "/.mypy_cache",
    "/.ruff_cache",
]

[tool.hatch.build.targets.wheel]
packages = ["src/quality"]
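The [project.scripts] table above wires a claude-quality console command to quality.cli.main:cli. A minimal smoke test of that entry point, sketched with click's standard CliRunner so no console script needs to be installed; the "src" argument is a hypothetical directory that must exist, since the CLI validates paths:

from click.testing import CliRunner

from quality.cli.main import cli

# Invoke the group exactly as the console script would.
runner = CliRunner()
result = runner.invoke(cli, ["--verbose", "complexity", "src", "--format", "console"])
print(result.output)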
src/quality/.quality-exceptions.yaml (Normal file, 131 lines)
@@ -0,0 +1,131 @@
# Quality Analysis Exceptions Configuration
# This file allows you to suppress specific analysis results based on file patterns,
# line patterns, issue types, and other criteria.

exceptions:
  enabled: true

  # Global file and directory exclusions
  # These patterns will suppress all issues for matching files/directories
  exclude_files:
    - "*/tests/*"
    - "*/test_*"
    - "*/__pycache__/*"
    - "*/migrations/*"
    - "*/conftest.py"
    - "*/.pytest_cache/*"

  exclude_directories:
    - "*/venv/*"
    - "*/.venv/*"
    - "*/node_modules/*"
    - "*/.git/*"
    - "*/build/*"
    - "*/dist/*"

  # Specific exception rules
  rules:
    # Example: Suppress complexity issues in legacy code
    - analysis_type: "complexity"
      issue_type: "high_complexity"
      file_patterns:
        - "*/legacy/*"
        - "*/third_party/*"
        - "*/vendor/*"
      reason: "Legacy code with known complexity - migration planned"

    # Example: Allow intentional Pydantic v1 usage in compatibility layers
    - analysis_type: "modernization"
      issue_type: "pydantic_v1_pattern"
      file_patterns:
        - "*/compatibility/*"
        - "*/adapters/*"
      line_patterns:
        - "# pydantic v1 required"
        - "# TODO: migrate to v2"
        - "# legacy compatibility"
      reason: "Intentional Pydantic v1 usage for compatibility with legacy systems"

    # Example: Suppress typing imports for external compatibility
    - analysis_type: "modernization"
      issue_type: "legacy_typing_import"
      file_patterns:
        - "*/external/*"
        - "*/integrations/*"
      reason: "External library compatibility requirements"

    # Example: Allow duplicates in generated/template code
    - analysis_type: "duplicates"
      file_patterns:
        - "*/templates/*"
        - "*/generated/*"
        - "*/auto_generated/*"
        - "*/schemas/auto/*"
      reason: "Generated or template code - duplication expected and acceptable"

    # Example: Suppress modernization issues in scripts
    - analysis_type: "modernization"
      file_patterns:
        - "*/scripts/*"
        - "*/migrations/*"
      reason: "Scripts and migrations prioritize backward compatibility"

    # Example: Temporary suppression with expiration for gradual refactoring
    - analysis_type: "complexity"
      issue_type: "high_complexity"
      file_patterns:
        - "*/parsers/*"
        - "*/processors/*"
      reason: "Complex parsing logic - refactoring scheduled for Q2 2024"
      expires: "2024-06-30"
      enabled: true

    # Example: Suppress specific modernization patterns in test files
    - analysis_type: "modernization"
      issue_type: "legacy_typing_import"
      file_patterns:
        - "**/test_*.py"
        - "**/tests/*.py"
      reason: "Tests may use legacy patterns for compatibility testing"

    # Example: Allow specific duplicates in configuration files
    - analysis_type: "duplicates"
      file_patterns:
        - "*/config/*"
        - "*/settings/*"
      line_patterns:
        - "# duplicate config acceptable"
        - "# shared configuration"
      reason: "Configuration files may have intentional duplication"

# Analysis Types Available:
# - "complexity" - Code complexity issues (high cyclomatic/cognitive complexity)
# - "duplicates" - Duplicate code detection
# - "modernization" - Modern Python pattern suggestions
# - "code_smells" - General code smell detection (if implemented)

# Common Issue Types:
# Complexity:
# - "high_complexity" - General high complexity
# - "cyclomatic_complexity" - High cyclomatic complexity
# - "cognitive_complexity" - High cognitive complexity
#
# Modernization:
# - "legacy_typing_import" - from typing import List, Dict, etc.
# - "pydantic_v1_pattern" - Pydantic v1 usage patterns
# - "old_string_formatting" - % string formatting
# - "format_to_fstring" - .format() that could be f-strings
# - "unnecessary_object_inheritance" - class Foo(object):
#
# Duplicates:
# - "duplicate_code" - General duplicate code blocks

# Pattern Syntax:
# - file_patterns: Unix shell-style wildcards (*, **, ?, [seq])
# - line_patterns: Python regex patterns
# - Use "*" for analysis_type to match all analysis types
# - Leave issue_type empty to match all issues of that analysis type

# Expiration Format:
# - expires: "YYYY-MM-DD" format
# - Rules with past expiration dates are automatically disabled
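The comment block above pins down the matching and expiration semantics. A minimal sketch of a rule matcher following those documented rules; this is an assumption about behaviour, since ExceptionFilter itself (src/quality/core/exceptions.py) is not part of this commit:

from datetime import date
from fnmatch import fnmatch
import re

def rule_matches(rule: dict, file_path: str, line: str) -> bool:
    """Hypothetical matcher mirroring the documented semantics."""
    if not rule.get("enabled", True):
        return False
    # Rules with past expiration dates are automatically disabled.
    expires = rule.get("expires")
    if expires is not None and date.fromisoformat(expires) < date.today():
        return False
    # file_patterns use Unix shell-style wildcards.
    if not any(fnmatch(file_path, p) for p in rule.get("file_patterns", ["*"])):
        return False
    # line_patterns are Python regexes; absent means "match any line".
    line_patterns = rule.get("line_patterns")
    if line_patterns and not any(re.search(p, line) for p in line_patterns):
        return False
    return True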
src/quality/__init__.py (Normal file, 9 lines)
@@ -0,0 +1,9 @@
"""Enhanced code quality analysis package."""

__version__ = "1.0.0"
__author__ = "IntelliKit Team"
__email__ = "team@intellikit.com"

# Minimal imports to prevent pre-commit failures
# Full imports can be added later when all modules are properly set up
__all__ = []
src/quality/analyzers/__init__.py (Normal file, 1 line)
@@ -0,0 +1 @@
"""Code analyzers for various quality checks."""
src/quality/analyzers/modernization.py (Normal file, 831 lines)
@@ -0,0 +1,831 @@
"""Modern Python patterns analyzer."""

import ast
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any

from ..config.schemas import QualityConfig
from ..core.exceptions import ExceptionFilter


@dataclass
class ModernizationIssue:
    """Represents a modernization issue in code."""

    file_path: str
    line_number: int
    column: int
    issue_type: str
    old_pattern: str
    suggested_fix: str
    severity: str  # "error", "warning", "info"
    description: str
    can_auto_fix: bool = True


class ModernizationAnalyzer(ast.NodeVisitor):
    """Analyzes code for non-modern Python patterns."""

    # Types that can be replaced with modern built-in equivalents
    REPLACEABLE_TYPING_IMPORTS = {
        "List",
        "Dict",
        "Tuple",
        "Set",
        "FrozenSet",
        "Union",
        "Optional",
    }

    # Types that moved to collections module but are replaceable
    COLLECTIONS_TYPING_IMPORTS = {
        "Deque",
        "Counter",
        "DefaultDict",
        "ChainMap",
        "OrderedDict",
    }

    # Types that moved to other modules
    MOVED_TYPING_IMPORTS = {
        "Callable": "collections.abc",
        "Coroutine": "collections.abc",
        "Awaitable": "collections.abc",
        "AsyncIterable": "collections.abc",
        "AsyncIterator": "collections.abc",
        "Iterable": "collections.abc",
        "Iterator": "collections.abc",
        "Generator": "collections.abc",
        "Hashable": "collections.abc",
        "Reversible": "collections.abc",
        "Container": "collections.abc",
        "Collection": "collections.abc",
        "Sequence": "collections.abc",
        "MutableSequence": "collections.abc",
        # "Set" is also in REPLACEABLE_TYPING_IMPORTS, which is checked
        # first, so typing.Set resolves to the built-in set.
        "Set": "collections.abc",
        "MutableSet": "collections.abc",
        "Mapping": "collections.abc",
        "MutableMapping": "collections.abc",
        "Sized": "collections.abc",
        "Pattern": "re",
        "Match": "re",
    }

    # Types that must remain in typing module (no modern replacement)
    REQUIRED_TYPING_IMPORTS = {
        "ClassVar",
        "TypeVar",
        "Generic",
        "Protocol",
        "Final",
        "Literal",
        "Type",
        "TypedDict",
        "NewType",
        "NoReturn",
        "Never",
        "Self",
        "Unpack",
        "TypeAlias",
        "TypeGuard",
        "TypeIs",
        "Annotated",
        "Any",
        "overload",
        "runtime_checkable",
        "TYPE_CHECKING",
    }

    # Combined set of all recognized typing imports
    ALL_TYPING_IMPORTS = (
        REPLACEABLE_TYPING_IMPORTS
        | COLLECTIONS_TYPING_IMPORTS
        | set(MOVED_TYPING_IMPORTS.keys())
        | REQUIRED_TYPING_IMPORTS
    )

    # Mapping for truly replaceable types
    REPLACEABLE_TO_MODERN = {
        "List": "list",
        "Dict": "dict",
        "Tuple": "tuple",
        "Set": "set",
        "FrozenSet": "frozenset",
        "Union": "|",
        "Optional": "| None",
    }

    # Mapping for collections types
    COLLECTIONS_TO_MODERN = {
        "Deque": "collections.deque",
        "Counter": "collections.Counter",
        "DefaultDict": "collections.defaultdict",
        "ChainMap": "collections.ChainMap",
        "OrderedDict": "collections.OrderedDict",
    }

    def __init__(
        self, file_path: str, content: str, config: QualityConfig | None = None
    ):
        self.file_path = file_path
        self.content = content
        self.content_lines = content.splitlines()
        self.config = config or QualityConfig()
        self.issues: list[ModernizationIssue] = []
        self.imports: dict[str, str] = {}  # name -> module
        self.typing_imports: set[str] = set()
        self.has_future_annotations = False

    def analyze(self) -> list[ModernizationIssue]:
        """Run the modernization analysis."""
        try:
            tree = ast.parse(self.content)
            self.visit(tree)

            # Additional pattern-based checks
            self._check_string_patterns()
            self._check_exception_patterns()
            self._check_super_patterns()

        except SyntaxError:
            pass  # Skip files with syntax errors

        return self.issues

    def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
        """Check for typing imports that can be modernized."""
        if node.module == "typing":
            for alias in node.names:
                name = alias.asname or alias.name

                if alias.name in self.ALL_TYPING_IMPORTS:
                    self.typing_imports.add(name)

                # Only flag imports that can be modernized
                if alias.name in self.REPLACEABLE_TYPING_IMPORTS:
                    self._add_replaceable_typing_import_issue(node, alias.name, name)
                elif alias.name in self.COLLECTIONS_TYPING_IMPORTS:
                    self._add_collections_typing_import_issue(node, alias.name, name)
                elif alias.name in self.MOVED_TYPING_IMPORTS:
                    self._add_moved_typing_import_issue(node, alias.name, name)
                # Note: REQUIRED_TYPING_IMPORTS are not flagged as issues

        elif node.module == "__future__" and any(
            alias.name == "annotations" for alias in node.names
        ):
            self.has_future_annotations = True

        # Track all imports for context
        if node.module:
            for alias in node.names:
                name = alias.asname or alias.name
                if name is not None and node.module is not None:
                    self.imports[name] = node.module

        self.generic_visit(node)

    def visit_Import(self, node: ast.Import) -> None:
        """Track regular imports."""
        for alias in node.names:
            name = alias.asname or alias.name
            self.imports[name] = alias.name
        self.generic_visit(node)

    def visit_Subscript(self, node: ast.Subscript) -> None:
        """Check for typing usage in type annotations that can be modernized."""
        typing_name = None

        if isinstance(node.value, ast.Name) and node.value.id in self.typing_imports:
            typing_name = node.value.id
        elif (
            isinstance(node.value, ast.Attribute)
            and isinstance(node.value.value, ast.Name)
            and node.value.value.id == "typing"
            and node.value.attr in self.ALL_TYPING_IMPORTS
        ):
            # Handle typing.List, typing.Dict etc.
            typing_name = node.value.attr

        if typing_name:
            # Only flag usage of types that can be modernized
            if typing_name in (
                self.REPLACEABLE_TYPING_IMPORTS | self.COLLECTIONS_TYPING_IMPORTS
            ):
                self._add_typing_usage_issue(node, typing_name)
            elif typing_name in self.MOVED_TYPING_IMPORTS:
                self._add_moved_typing_usage_issue(node, typing_name)
            # Note: REQUIRED_TYPING_IMPORTS usage is not flagged

        self.generic_visit(node)

    def visit_BinOp(self, node: ast.BinOp) -> None:
        """Check for Union usage that could be modernized."""
        if isinstance(node.op, ast.BitOr):
            # This is already modern syntax (X | Y)
            pass
        self.generic_visit(node)

    def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
        """Check function definitions for modernization opportunities."""
        # Check for missing return type annotations
        if not node.returns and not self._is_dunder_method(node.name):
            self._add_missing_return_type_issue(node)

        # Check for untyped parameters
        for arg in node.args.args:
            if not arg.annotation and arg.arg != "self" and arg.arg != "cls":
                self._add_missing_param_type_issue(node, arg)

        self.generic_visit(node)

    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
        """Check async function definitions."""
        # Same checks as regular functions
        if not node.returns and not self._is_dunder_method(node.name):
            self._add_missing_return_type_issue(node)

        for arg in node.args.args:
            if not arg.annotation and arg.arg != "self" and arg.arg != "cls":
                self._add_missing_param_type_issue(node, arg)

        self.generic_visit(node)

    def visit_ClassDef(self, node: ast.ClassDef) -> None:
        """Check class definitions for modernization opportunities."""
        # Check if class inherits from object (unnecessary in Python 3)
        for base in node.bases:
            if isinstance(base, ast.Name) and base.id == "object":
                self._add_unnecessary_object_inheritance_issue(node, base)

        self.generic_visit(node)

    def _add_replaceable_typing_import_issue(
        self, node: ast.ImportFrom, typing_name: str, import_name: str
    ) -> None:
        """Add issue for typing import that can be replaced with built-ins."""
        modern_replacement = self.REPLACEABLE_TO_MODERN[typing_name]

        if typing_name in ["List", "Dict", "Tuple", "Set", "FrozenSet"]:
            description = f"Use built-in '{modern_replacement}' instead of 'typing.{typing_name}' (Python 3.9+)"
            severity = "warning"
        elif typing_name == "Union":
            description = (
                "Use '|' union operator instead of 'typing.Union' (Python 3.10+)"
            )
            severity = "warning"
        elif typing_name == "Optional":
            description = "Use '| None' instead of 'typing.Optional' (Python 3.10+)"
            severity = "warning"
        else:
            description = (
                f"Use '{modern_replacement}' instead of 'typing.{typing_name}'"
            )
            severity = "warning"

        self.issues.append(
            ModernizationIssue(
                file_path=self.file_path,
                line_number=node.lineno,
                column=node.col_offset,
                issue_type="replaceable_typing_import",
                old_pattern=f"from typing import {typing_name}",
                suggested_fix=f"# Remove this import and use {modern_replacement} directly",
                severity=severity,
                description=description,
            )
        )

    def _add_collections_typing_import_issue(
        self, node: ast.ImportFrom, typing_name: str, import_name: str
    ) -> None:
        """Add issue for typing import that moved to collections."""
        # Use the real collections name; naive lowercasing would break
        # Counter, ChainMap and OrderedDict.
        modern_name = self.COLLECTIONS_TO_MODERN[typing_name].removeprefix(
            "collections."
        )

        self.issues.append(
            ModernizationIssue(
                file_path=self.file_path,
                line_number=node.lineno,
                column=node.col_offset,
                issue_type="collections_typing_import",
                old_pattern=f"from typing import {typing_name}",
                suggested_fix=f"from collections import {modern_name}",
                severity="info",
                description=f"Use 'from collections import {modern_name}' instead of 'typing.{typing_name}'",
            )
        )

    def _add_moved_typing_import_issue(
        self, node: ast.ImportFrom, typing_name: str, import_name: str
    ) -> None:
        """Add issue for typing import that moved to another module."""
        target_module = self.MOVED_TYPING_IMPORTS[typing_name]

        self.issues.append(
            ModernizationIssue(
                file_path=self.file_path,
                line_number=node.lineno,
                column=node.col_offset,
                issue_type="moved_typing_import",
                old_pattern=f"from typing import {typing_name}",
                suggested_fix=f"from {target_module} import {typing_name}",
                severity="info",
                description=f"'{typing_name}' moved from 'typing' to '{target_module}' module",
            )
        )

    def _add_typing_usage_issue(self, node: ast.Subscript, typing_name: str) -> None:
        """Add issue for typing usage that can be modernized."""
        if typing_name in self.REPLACEABLE_TYPING_IMPORTS:
            modern_replacement = self.REPLACEABLE_TO_MODERN[typing_name]
            if typing_name in ["List", "Dict", "Tuple", "Set", "FrozenSet"]:
                old_pattern = f"{typing_name}[...]"
                new_pattern = f"{modern_replacement}[...]"
                description = f"Use built-in '{modern_replacement}' instead of 'typing.{typing_name}'"
                severity = "warning"
            elif typing_name == "Union":
                old_pattern = "Union[...]"
                new_pattern = "... | ..."
                description = "Use '|' union operator instead of 'typing.Union'"
                severity = "warning"
            elif typing_name == "Optional":
                old_pattern = "Optional[...]"
                new_pattern = "... | None"
                description = "Use '| None' instead of 'typing.Optional'"
                severity = "warning"
            else:
                return  # Skip unknown replaceable types
        elif typing_name in self.COLLECTIONS_TYPING_IMPORTS:
            modern_replacement = self.COLLECTIONS_TO_MODERN[typing_name]
            old_pattern = f"{typing_name}[...]"
            new_pattern = f"{modern_replacement}[...]"
            description = (
                f"Use '{modern_replacement}' instead of 'typing.{typing_name}'"
            )
            severity = "info"
        else:
            return  # Skip unknown types

        self.issues.append(
            ModernizationIssue(
                file_path=self.file_path,
                line_number=node.lineno,
                column=node.col_offset,
                issue_type="typing_usage",
                old_pattern=old_pattern,
                suggested_fix=new_pattern,
                severity=severity,
                description=description,
            )
        )

    def _add_moved_typing_usage_issue(
        self, node: ast.Subscript, typing_name: str
    ) -> None:
        """Add issue for typing usage that moved to another module."""
        target_module = self.MOVED_TYPING_IMPORTS[typing_name]

        self.issues.append(
            ModernizationIssue(
                file_path=self.file_path,
                line_number=node.lineno,
                column=node.col_offset,
                issue_type="moved_typing_usage",
                old_pattern=f"typing.{typing_name}[...]",
                suggested_fix=f"{target_module}.{typing_name}[...]",
                severity="info",
                description=f"Use '{target_module}.{typing_name}' instead of 'typing.{typing_name}'",
            )
        )

    def _add_missing_return_type_issue(
        self, node: ast.FunctionDef | ast.AsyncFunctionDef
    ) -> None:
        """Add issue for missing return type annotation."""
        self.issues.append(
            ModernizationIssue(
                file_path=self.file_path,
                line_number=node.lineno,
                column=node.col_offset,
                issue_type="missing_return_type",
                old_pattern=f"def {node.name}(...)",
                suggested_fix=f"def {node.name}(...) -> ReturnType",
                severity="info",
                description="Consider adding return type annotation for better type safety",
                can_auto_fix=False,
            )
        )

    def _add_missing_param_type_issue(
        self, node: ast.FunctionDef | ast.AsyncFunctionDef, arg: ast.arg
    ) -> None:
        """Add issue for missing parameter type annotation."""
        self.issues.append(
            ModernizationIssue(
                file_path=self.file_path,
                line_number=node.lineno,
                column=node.col_offset,
                issue_type="missing_param_type",
                old_pattern=f"{arg.arg}",
                suggested_fix=f"{arg.arg}: ParamType",
                severity="info",
                description=f"Consider adding type annotation for parameter '{arg.arg}'",
                can_auto_fix=False,
            )
        )

    def _add_unnecessary_object_inheritance_issue(
        self, node: ast.ClassDef, base: ast.Name
    ) -> None:
        """Add issue for unnecessary object inheritance."""
        self.issues.append(
            ModernizationIssue(
                file_path=self.file_path,
                line_number=node.lineno,
                column=node.col_offset,
                issue_type="unnecessary_object_inheritance",
                old_pattern=f"class {node.name}(object)",
                suggested_fix=f"class {node.name}",
                severity="info",
                description="Inheriting from 'object' is unnecessary in Python 3",
                can_auto_fix=True,
            )
        )

    def _check_string_patterns(self) -> None:
        """Check for old-style string formatting."""
        for i, line in enumerate(self.content_lines, 1):
            # Check for % formatting
            if re.search(r'["\'].*%[sd].*["\'].*%', line):
                self.issues.append(
                    ModernizationIssue(
                        file_path=self.file_path,
                        line_number=i,
                        column=0,
                        issue_type="old_string_formatting",
                        old_pattern="'...' % (...)",
                        suggested_fix="f'...' or '...'.format(...)",
                        severity="info",
                        description="Consider using f-strings or .format() instead of % formatting",
                    )
                )

            # Check for .format() that could be f-string
            if re.search(r'["\'].*\{.*\}.*["\']\.format\(', line):
                self.issues.append(
                    ModernizationIssue(
                        file_path=self.file_path,
                        line_number=i,
                        column=0,
                        issue_type="format_to_fstring",
                        old_pattern="'...{}'.format(...)",
                        suggested_fix="f'...{...}'",
                        severity="info",
                        description="Consider using f-strings instead of .format() for better readability",
                    )
                )

    def _check_exception_patterns(self) -> None:
        """Check for old-style exception handling."""
        for i, line in enumerate(self.content_lines, 1):
            # Check for bare except
            if re.search(r"except\s*:", line.strip()):
                self.issues.append(
                    ModernizationIssue(
                        file_path=self.file_path,
                        line_number=i,
                        column=0,
                        issue_type="bare_except",
                        old_pattern="except:",
                        suggested_fix="except Exception:",
                        severity="warning",
                        description="Use specific exception types instead of bare except",
                    )
                )

    def _check_super_patterns(self) -> None:
        """Check for old-style super() calls."""
        for i, line in enumerate(self.content_lines, 1):
            # Check for old-style super calls
            if re.search(r"super\(\s*\w+\s*,\s*self\s*\)", line):
                self.issues.append(
                    ModernizationIssue(
                        file_path=self.file_path,
                        line_number=i,
                        column=0,
                        issue_type="old_super_call",
                        old_pattern="super(ClassName, self)",
                        suggested_fix="super()",
                        severity="info",
                        description="Use super() without arguments (Python 3+)",
                    )
                )

    def _is_dunder_method(self, name: str) -> bool:
        """Check if method name is a dunder method."""
        return name.startswith("__") and name.endswith("__")


class PydanticAnalyzer:
    """Analyzes Pydantic usage patterns and migration opportunities."""

    V1_PATTERNS = {
        # Model configuration patterns
        r"class\s+Config:": "Use model_config instead of Config class (Pydantic v2)",
        # Field patterns
        r"Field\([^)]*allow_mutation=": "allow_mutation is deprecated, use frozen instead",
        r"Field\([^)]*regex=": "regex parameter is deprecated, use pattern instead",
        r"Field\([^)]*min_length=": "Consider using StringConstraints for string validation",
        r"Field\([^)]*max_length=": "Consider using StringConstraints for string validation",
        # Validator patterns
        r"@validator": "@validator is deprecated, use @field_validator instead",
        r"@root_validator": "@root_validator is deprecated, use @model_validator instead",
        r"pre=True": "pre parameter syntax changed in Pydantic v2",
        # Model methods
        r"\.dict\(\)": "Use .model_dump() instead of .dict() (Pydantic v2)",
        r"\.json\(\)": "Use .model_dump_json() instead of .json() (Pydantic v2)",
        r"\.parse_obj\(": "Use model_validate() instead of parse_obj() (Pydantic v2)",
        r"\.parse_raw\(": "Use model_validate_json() instead of parse_raw() (Pydantic v2)",
        r"\.schema\(\)": "Use model_json_schema() instead of schema() (Pydantic v2)",
        r"\.copy\(\)": "Use model_copy() instead of copy() (Pydantic v2)",
        # Import patterns
        r"from pydantic import.*BaseSettings": "BaseSettings moved to pydantic-settings package",
    }

    # Pydantic v2 methods that should NEVER be flagged as issues when used with model classes
    V2_METHODS = {
        "model_validate",
        "model_validate_json",
        "model_dump",
        "model_dump_json",
        "model_copy",
        "model_json_schema",
        "model_rebuild",
        "model_fields",
        "model_fields_set",
        "model_computed_fields",
        "model_config",
        "model_extra",
    }

    INTENTIONAL_V1_CONTEXTS = {
        # These patterns suggest intentional v1 usage that might be needed
        "pydantic.v1",  # Explicit v1 import
        "pydantic_v1",  # Common alias for v1
        "__pydantic_model__",  # v1 compatibility marker
        "model_rebuild",  # Sometimes used in migration contexts
        "# pydantic v1",  # Comment indicating intentional v1 usage
        "# TODO: migrate",  # Comment indicating planned migration
    }

    def __init__(self, file_path: str, content: str):
        self.file_path = file_path
        self.content = content
        self.content_lines = content.splitlines()
        self.issues: list[ModernizationIssue] = []

    def analyze(self) -> list[ModernizationIssue]:
        """Analyze Pydantic usage patterns."""
        has_pydantic_import = self._has_pydantic_import()
        if not has_pydantic_import:
            return []

        # Check if this looks like intentional v1 usage
        is_intentional_v1 = self._is_intentional_v1_usage()

        for i, line in enumerate(self.content_lines, 1):
            # Skip lines that contain valid Pydantic v2 patterns
            if self._is_valid_v2_pattern(line):
                continue

            for pattern, description in self.V1_PATTERNS.items():
                if re.search(pattern, line):
                    severity = "info" if is_intentional_v1 else "warning"

                    # Determine suggested fix based on pattern
                    suggested_fix = self._get_suggested_fix(pattern, line)

                    self.issues.append(
                        ModernizationIssue(
                            file_path=self.file_path,
                            line_number=i,
                            column=0,
                            issue_type="pydantic_v1_pattern",
                            old_pattern=pattern,
                            suggested_fix=suggested_fix,
                            severity=severity,
                            description=description,
                            can_auto_fix=pattern
                            in [r"\.dict\(\)", r"\.json\(\)", r"\.copy\(\)"],
                        )
                    )

        return self.issues

    def _has_pydantic_import(self) -> bool:
        """Check if file imports Pydantic."""
        return any(
            "pydantic" in line for line in self.content_lines[:20]
        )  # Check first 20 lines

    def _is_intentional_v1_usage(self) -> bool:
        """Check if this appears to be intentional v1 usage."""
        content_lower = self.content.lower()
        # Lowercase both sides so markers such as "# TODO: migrate" still match.
        return any(
            context.lower() in content_lower
            for context in self.INTENTIONAL_V1_CONTEXTS
        )

    def _is_valid_v2_pattern(self, line: str) -> bool:
        """Check if line contains valid Pydantic v2 patterns that should not be flagged."""
        # Check if line contains any valid v2 methods
        return any(f".{v2_method}(" in line for v2_method in self.V2_METHODS)

    def _get_suggested_fix(self, pattern: str, line: str) -> str:
        """Get suggested fix for a Pydantic pattern."""
        fixes = {
            r"\.dict\(\)": line.replace(".dict()", ".model_dump()"),
            r"\.json\(\)": line.replace(".json()", ".model_dump_json()"),
            r"\.copy\(\)": line.replace(".copy()", ".model_copy()"),
            r"@validator": line.replace("@validator", "@field_validator"),
            r"@root_validator": line.replace("@root_validator", "@model_validator"),
        }

        for fix_pattern, fix_line in fixes.items():
            if re.search(fix_pattern, line):
                return fix_line.strip()

        return "See Pydantic v2 migration guide"


class ModernizationEngine:
    """Main engine for running modernization analysis."""

    def __init__(self, config: QualityConfig | None = None):
        self.config = config or QualityConfig()
        self.exception_filter = ExceptionFilter(self.config)

    def analyze_file(self, file_path: Path) -> list[ModernizationIssue]:
        """Analyze a single file for modernization opportunities."""
        try:
            with open(file_path, encoding="utf-8") as f:
                content = f.read()
        except (OSError, UnicodeDecodeError):
            return []

        issues = []

        # Python modernization analysis
        python_analyzer = ModernizationAnalyzer(str(file_path), content, self.config)
        issues.extend(python_analyzer.analyze())

        # Pydantic analysis
        pydantic_analyzer = PydanticAnalyzer(str(file_path), content)
        issues.extend(pydantic_analyzer.analyze())

        return issues

    def analyze_files(
        self, file_paths: list[Path]
    ) -> dict[Path, list[ModernizationIssue]]:
        """Analyze multiple files for modernization opportunities."""
        results: dict[Path, list[ModernizationIssue]] = {}

        for file_path in file_paths:
            if file_path.suffix.lower() == ".py":
                issues = self.analyze_file(file_path)

                # Apply exception filtering
                filtered_issues = self.exception_filter.filter_issues(
                    "modernization",
                    issues,
                    get_file_path_fn=lambda issue: issue.file_path,
                    get_line_number_fn=lambda issue: issue.line_number,
                    get_issue_type_fn=lambda issue: issue.issue_type,
                    get_line_content_fn=lambda issue: self._get_line_content(
                        issue.file_path, issue.line_number
                    ),
                )

                if filtered_issues:  # Only include files with remaining issues
                    results[file_path] = filtered_issues

        return results

    def _get_line_content(self, file_path: str, line_number: int) -> str:
        """Get the content of a specific line from a file."""
        try:
            with open(file_path, encoding="utf-8") as f:
                lines = f.readlines()
            if 1 <= line_number <= len(lines):
                return lines[line_number - 1].strip()
        except (OSError, UnicodeDecodeError):
            pass
        return ""

    def get_summary(
        self, results: dict[Path, list[ModernizationIssue]]
    ) -> dict[str, Any]:
        """Generate summary of modernization analysis."""
        all_issues = []
        for issues in results.values():
            if issues is not None:
                all_issues.extend(issues)

        # Group by issue type
        by_type: dict[str, list[ModernizationIssue]] = {}
        by_severity = {"error": 0, "warning": 0, "info": 0}

        for issue in all_issues:
            by_type.setdefault(issue.issue_type, []).append(issue)
            by_severity[issue.severity] += 1

        # Top files with most issues
        file_counts: dict[Path, int] = {}
        for file_path, issues in results.items():
            if issues:
                file_counts[file_path] = len(issues)

        top_files = sorted(file_counts.items(), key=lambda x: x[1], reverse=True)[:10]

        # Auto-fixable issues
        auto_fixable = sum(1 for issue in all_issues if issue.can_auto_fix)

        return {
            "total_files_analyzed": len(results),
            "files_with_issues": len(
                [f for f, issues in results.items() if issues]
            ),
            "total_issues": len(all_issues),
            "by_severity": by_severity,
            "by_type": {k: len(v) for k, v in by_type.items()},
            "auto_fixable_count": auto_fixable,
            "top_files_with_issues": [(str(f), count) for f, count in top_files],
            "recommendations": self._generate_recommendations(by_type, by_severity),
        }

    def _generate_recommendations(
        self, by_type: dict[str, list[ModernizationIssue]], by_severity: dict[str, int]
    ) -> list[str]:
        """Generate recommendations based on analysis results."""
        recommendations = []

        # Handle new typing import issue types
        replaceable_count = len(by_type.get("replaceable_typing_import", []))
        collections_count = len(by_type.get("collections_typing_import", []))
        moved_count = len(by_type.get("moved_typing_import", []))

        if replaceable_count > 0:
            recommendations.append(
                f"🔄 Update {replaceable_count} typing imports to use modern built-in types (Python 3.9+)"
            )

        if collections_count > 0:
            recommendations.append(
                f"📦 Update {collections_count} typing imports to use collections module"
            )

        if moved_count > 0:
            recommendations.append(
                f"🔀 Update {moved_count} typing imports that moved to other modules"
            )

        # Handle typing usage issues
        usage_count = len(by_type.get("typing_usage", []))
        moved_usage_count = len(by_type.get("moved_typing_usage", []))

        if usage_count > 0:
            recommendations.append(
                f"⚡ Modernize {usage_count} type annotations to use built-ins or | union syntax"
            )

        if moved_usage_count > 0:
            recommendations.append(
                f"🔀 Update {moved_usage_count} type annotations that moved to other modules"
            )

        # Keep existing recommendations for other issue types
        if "pydantic_v1_pattern" in by_type:
            count = len(by_type["pydantic_v1_pattern"])
            recommendations.append(f"📦 Migrate {count} Pydantic v1 patterns to v2 API")

        if "old_string_formatting" in by_type:
            count = len(by_type["old_string_formatting"])
            recommendations.append(
                f"✨ Replace {count} old string formatting patterns with f-strings"
            )

        if "bare_except" in by_type:
            count = len(by_type["bare_except"])
            recommendations.append(
                f"⚠️ Fix {count} bare except clauses for better error handling"
            )

        if by_severity["warning"] > 10:
            recommendations.append(
                f"🚨 Address {by_severity['warning']} warning-level issues for better code quality"
            )

        return recommendations
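For a concrete sense of what the analyzer above reports, a small sketch (the sample source string is invented) that exercises ModernizationAnalyzer directly; it should flag the replaceable typing import, the List[int] annotation, and the missing return type:

# Hypothetical sample input; "sample.py" is just a label for the report.
sample = "from typing import List\n\ndef first(xs: List[int]):\n    return xs[0]\n"
for issue in ModernizationAnalyzer("sample.py", sample).analyze():
    print(f"{issue.line_number}:{issue.issue_type}: {issue.description}")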
src/quality/cli/__init__.py (Normal file, 1 line)
@@ -0,0 +1 @@
"""CLI interface for the quality analysis package."""
src/quality/cli/main.py (Normal file, 691 lines)
@@ -0,0 +1,691 @@
#!/usr/bin/env python3
"""Main CLI interface for code quality analysis."""

import ast
import csv
import json
import sys
from pathlib import Path
from typing import Any

import click

from ..analyzers.modernization import ModernizationEngine
from ..complexity.analyzer import ComplexityAnalyzer
from ..config.schemas import QualityConfig, _load_from_yaml, load_config
from ..core.ast_analyzer import ASTAnalyzer
from ..core.exceptions import create_exceptions_config_template
from ..detection.engine import DuplicateDetectionEngine
from ..utils.file_finder import FileFinder


@click.group()
@click.option(
    "--config",
    "-c",
    type=click.Path(exists=True, path_type=Path),
    help="Path to configuration file",
)
@click.option(
    "--exceptions-file",
    "-e",
    type=click.Path(exists=True, path_type=Path),
    help="Path to exceptions configuration file",
)
@click.option("--verbose", "-v", is_flag=True, help="Enable verbose output")
@click.pass_context
def cli(
    ctx: click.Context, config: Path | None, exceptions_file: Path | None, verbose: bool
) -> None:
    """Code quality analysis toolkit."""
    ctx.ensure_object(dict)

    # Load configuration
    quality_config = load_config(config)
    quality_config.verbose = verbose

    # Load exceptions configuration if provided
    if exceptions_file:
        exceptions_data = _load_from_yaml(exceptions_file)
        if hasattr(exceptions_data, "exceptions"):
            quality_config.exceptions = exceptions_data.exceptions

    ctx.obj["config"] = quality_config
    ctx.obj["verbose"] = verbose


@cli.command()
@click.argument(
    "paths", nargs=-1, required=True, type=click.Path(exists=True, path_type=Path)
)
@click.option("--threshold", "-t", default=0.8, help="Similarity threshold (0.0-1.0)")
@click.option("--min-lines", default=5, help="Minimum lines for duplicate detection")
@click.option("--min-tokens", default=50, help="Minimum tokens for duplicate detection")
@click.option("--output", "-o", type=click.File("w"), help="Output file for results")
@click.option(
    "--format",
    "output_format",
    default="json",
    type=click.Choice(["json", "console", "csv"]),
    help="Output format",
)
@click.pass_context
def duplicates(
    ctx: click.Context,
    paths: tuple[Path, ...],
    threshold: float,
    min_lines: int,
    min_tokens: int,
    output: Any,
    output_format: str,
) -> None:
    """Detect duplicate code patterns."""
    config: QualityConfig = ctx.obj["config"]
    verbose: bool = ctx.obj["verbose"]

    # Update config with CLI options
    config.detection.similarity_threshold = threshold
    config.detection.min_lines = min_lines
    config.detection.min_tokens = min_tokens

    if verbose:
        click.echo(f"🔍 Analyzing paths: {', '.join(str(p) for p in paths)}")
        click.echo(f"📊 Similarity threshold: {threshold}")
        click.echo(f"📏 Min lines: {min_lines}, Min tokens: {min_tokens}")

    # Find Python files
    file_finder = FileFinder(config.paths, config.languages)
    all_files = []
    for path in paths:
        if path.is_file():
            all_files.append(path)
        else:
            files = file_finder.find_files(path)
            all_files.extend(files)

    if not all_files:
        click.echo("❌ No Python files found in the specified paths.", err=True)
        return

    if verbose:
        click.echo(f"📂 Found {len(all_files)} Python files")

    # Run duplicate detection
    engine = DuplicateDetectionEngine(config)
    duplicates_found = engine.detect_duplicates_in_files(all_files)

    if verbose:
        click.echo(f"🔍 Found {len(duplicates_found)} duplicate groups")

    # Generate output
    results: dict[str, Any] = {
        "summary": {
            "total_files_analyzed": len(all_files),
            "duplicate_groups_found": len(duplicates_found),
            "total_duplicate_blocks": sum(
                len(match.blocks) for match in duplicates_found
            ),
            "configuration": {
                "similarity_threshold": threshold,
                "min_lines": min_lines,
                "min_tokens": min_tokens,
            },
        },
        "duplicates": [],
    }

    for i, match in enumerate(duplicates_found, 1):
        detailed_analysis = engine.get_detailed_analysis(match)
        results["duplicates"].append({"group_id": i, "analysis": detailed_analysis})

    # Output results
    if output_format == "json":
        if output:
            json.dump(results, output, indent=2, default=str)
        else:
            click.echo(json.dumps(results, indent=2, default=str))
    elif output_format == "console":
        _print_console_duplicates(results, verbose)
    elif output_format == "csv":
        _print_csv_duplicates(results, output)


@cli.command()
@click.argument(
    "paths", nargs=-1, required=True, type=click.Path(exists=True, path_type=Path)
)
@click.option("--threshold", default=10, help="Complexity threshold")
@click.option("--output", "-o", type=click.File("w"), help="Output file for results")
@click.option(
    "--format",
    "output_format",
    default="json",
    type=click.Choice(["json", "console"]),
    help="Output format",
)
@click.pass_context
def complexity(
    ctx: click.Context,
    paths: tuple[Path, ...],
    threshold: int,
    output: Any,
    output_format: str,
) -> None:
    """Analyze code complexity."""
    config: QualityConfig = ctx.obj["config"]
    verbose: bool = ctx.obj["verbose"]

    config.complexity.complexity_threshold = threshold

    if verbose:
        click.echo(f"🔍 Analyzing complexity in: {', '.join(str(p) for p in paths)}")
        click.echo(f"📊 Complexity threshold: {threshold}")

    # Find Python files
    file_finder = FileFinder(config.paths, config.languages)
    all_files = []
    for path in paths:
        if path.is_file():
            all_files.append(path)
        else:
            files = file_finder.find_files(path)
            all_files.extend(files)

    if not all_files:
        click.echo("❌ No Python files found in the specified paths.", err=True)
        return

    if verbose:
        click.echo(f"📂 Found {len(all_files)} Python files")

    # Run complexity analysis
    analyzer = ComplexityAnalyzer(config.complexity)
    overview = analyzer.get_project_complexity_overview(all_files)

    # Output results
    if output_format == "json":
        if output:
            json.dump(overview, output, indent=2, default=str)
        else:
            click.echo(json.dumps(overview, indent=2, default=str))
    elif output_format == "console":
        _print_console_complexity(overview, verbose)


@cli.command()
@click.argument(
    "paths", nargs=-1, required=True, type=click.Path(exists=True, path_type=Path)
)
@click.option(
    "--include-type-hints", is_flag=True, help="Include missing type hint analysis"
)
@click.option("--pydantic-only", is_flag=True, help="Only analyze Pydantic patterns")
@click.option("--output", "-o", type=click.File("w"), help="Output file for results")
@click.option(
    "--format",
    "output_format",
    default="json",
    type=click.Choice(["json", "console"]),
    help="Output format",
)
@click.pass_context
def modernization(
    ctx: click.Context,
    paths: tuple[Path, ...],
    include_type_hints: bool,
    pydantic_only: bool,
    output: Any,
    output_format: str,
) -> None:
    """Analyze code for modernization opportunities."""

    config: QualityConfig = ctx.obj["config"]
    verbose: bool = ctx.obj["verbose"]

    if verbose:
        click.echo(
            f"🔍 Analyzing modernization opportunities in: {', '.join(str(p) for p in paths)}"
        )
        if include_type_hints:
            click.echo("📝 Including type hint analysis")
        if pydantic_only:
            click.echo("📦 Pydantic-only analysis mode")

    # Find Python files
    file_finder = FileFinder(config.paths, config.languages)
    all_files = []
    for path in paths:
        if path.is_file():
            all_files.append(path)
        else:
            files = file_finder.find_files(path)
            all_files.extend(files)

    if not all_files:
        click.echo("❌ No Python files found in the specified paths.", err=True)
        return

    if verbose:
        click.echo(f"📂 Found {len(all_files)} Python files")

    # Run modernization analysis
    engine = ModernizationEngine(config)
    results = engine.analyze_files(all_files)
    summary = engine.get_summary(results)

    # Filter results if needed
    if pydantic_only:
        filtered_results = {}
        for file_path, issues in results.items():
            pydantic_issues = [
                issue for issue in issues if issue.issue_type == "pydantic_v1_pattern"
            ]
            if pydantic_issues:
                filtered_results[file_path] = pydantic_issues
        results = filtered_results

        # Recalculate summary
        summary = engine.get_summary(results)

    # Output results
    final_results = {
        "summary": summary,
        "files": {
            str(file_path): [issue.__dict__ for issue in issues]
            for file_path, issues in results.items()
            if issues
        },
    }

    if output_format == "json":
        if output:
            json.dump(final_results, output, indent=2, default=str)
        else:
            click.echo(json.dumps(final_results, indent=2, default=str))
    elif output_format == "console":
        _print_console_modernization(final_results, verbose, include_type_hints)


@cli.command()
@click.argument(
    "paths", nargs=-1, required=True, type=click.Path(exists=True, path_type=Path)
)
@click.option("--output", "-o", type=click.File("w"), help="Output file for results")
@click.option(
    "--format",
    "output_format",
    default="json",
    type=click.Choice(["json", "console"]),
    help="Output format",
)
@click.pass_context
def full_analysis(
    ctx: click.Context, paths: tuple[Path, ...], output: Any, output_format: str
) -> None:
    """Run comprehensive code quality analysis."""
    config: QualityConfig = ctx.obj["config"]
    verbose: bool = ctx.obj["verbose"]

    if verbose:
        click.echo(
            f"🔍 Running full quality analysis on: {', '.join(str(p) for p in paths)}"
        )

    # Find Python files
    file_finder = FileFinder(config.paths, config.languages)
    all_files = []
    for path in paths:
        if path.is_file():
            all_files.append(path)
        else:
            files = file_finder.find_files(path)
            all_files.extend(files)

    if not all_files:
        click.echo("❌ No Python files found in the specified paths.", err=True)
        return

    if verbose:
        click.echo(f"📂 Found {len(all_files)} Python files")

    # Run all analyses
    results: dict[str, Any] = {
        "metadata": {
            "total_files": len(all_files),
            "analyzed_paths": [str(p) for p in paths],
            # QualityConfig is a pydantic>=2 model, so use model_dump()
            # rather than the deprecated .dict() this toolkit itself flags.
            "configuration": config.model_dump(),
        }
    }

    # Complexity analysis
    if verbose:
        click.echo("📊 Running complexity analysis...")
    complexity_analyzer = ComplexityAnalyzer(config.complexity)
    results["complexity"] = complexity_analyzer.get_project_complexity_overview(
        all_files
    )

    # Duplicate detection
    if verbose:
        click.echo("🔍 Running duplicate detection...")
    duplicate_engine = DuplicateDetectionEngine(config)
    duplicates_found = duplicate_engine.detect_duplicates_in_files(all_files)

    results["duplicates"] = {
        "summary": {
            "duplicate_groups_found": len(duplicates_found),
            "total_duplicate_blocks": sum(
                len(match.blocks) for match in duplicates_found
            ),
        },
        "details": [],
    }

    for i, match in enumerate(duplicates_found, 1):
        detailed_analysis = duplicate_engine.get_detailed_analysis(match)
        duplicate_details = results["duplicates"]["details"]
        if isinstance(duplicate_details, list):
            duplicate_details.append({"group_id": i, "analysis": detailed_analysis})

    # Code smells detection
    if verbose:
        click.echo("👃 Detecting code smells...")
    all_smells = []
    for file_path in all_files:
        try:
            with open(file_path, encoding="utf-8") as f:
                content = f.read()
            ast_analyzer = ASTAnalyzer(str(file_path), content)
            # Parse the AST and analyze
            tree = ast.parse(content)
            ast_analyzer.visit(tree)
            smells = ast_analyzer.detect_code_smells()
            if smells:
                all_smells.extend(
                    [{"file": str(file_path), "smell": smell} for smell in smells]
                )
        except Exception:
            continue

    results["code_smells"] = {"total_smells": len(all_smells), "details": all_smells}

    # Generate overall quality score
    results["quality_score"] = _calculate_overall_quality_score(results)

    # Output results
    if output_format == "json":
        if output:
            json.dump(results, output, indent=2, default=str)
        else:
            click.echo(json.dumps(results, indent=2, default=str))
    elif output_format == "console":
        _print_console_full_analysis(results, verbose)


def _print_console_duplicates(results: dict[str, Any], verbose: bool) -> None:
    """Print duplicate results in console format."""
    summary = results["summary"]

    click.echo("\n🔍 DUPLICATE CODE ANALYSIS")
    click.echo("=" * 50)
    click.echo(f"📂 Files analyzed: {summary['total_files_analyzed']}")
    click.echo(f"🔄 Duplicate groups: {summary['duplicate_groups_found']}")
    click.echo(f"📊 Total duplicate blocks: {summary['total_duplicate_blocks']}")

    if not results["duplicates"]:
        click.echo("\n✅ No significant duplicate code patterns found!")
        return

    click.echo(f"\n🚨 Found {len(results['duplicates'])} duplicate groups:")

    for dup in results["duplicates"]:
        analysis = dup["analysis"]
        match_info = analysis["match_info"]

        click.echo(f"\n📋 Group #{dup['group_id']}")
        click.echo(f"   Similarity: {match_info['similarity_score']:.2%}")
        click.echo(f"   Priority: {match_info['priority_score']:.2f}")
        click.echo(f"   Type: {match_info['match_type']}")

        click.echo("   📁 Affected files:")
        for block in analysis["blocks"]:
            click.echo(f"      • {block['file_path']} (lines {block['line_range']})")

        if verbose and analysis["refactoring_suggestions"]:
            click.echo("   💡 Refactoring suggestions:")
            for suggestion in analysis["refactoring_suggestions"]:
                click.echo(f"      • {suggestion}")


def _print_csv_duplicates(results: dict[str, Any], output: Any) -> None:
    """Print duplicate results in CSV format."""

    if not output:
        output = sys.stdout

    writer = csv.writer(output)
    writer.writerow(
        [
            "Group ID",
            "Similarity Score",
            "Priority Score",
            "Match Type",
            "File Path",
            "Line Range",
            "Lines of Code",
            "Estimated Effort",
            "Risk Level",
        ]
    )

    for dup in results["duplicates"]:
        analysis = dup["analysis"]
        match_info = analysis["match_info"]

        for block in analysis["blocks"]:
            writer.writerow(
                [
                    dup["group_id"],
                    f"{match_info['similarity_score']:.2%}",
                    f"{match_info['priority_score']:.2f}",
                    match_info["match_type"],
                    block["file_path"],
                    block["line_range"],
                    block["lines_of_code"],
                    analysis.get("estimated_effort", "Unknown"),
                    analysis.get("risk_assessment", "Unknown"),
                ]
            )


def _print_console_complexity(results: dict[str, Any], verbose: bool) -> None:
    """Print complexity results in console format."""
    click.echo("\n📊 COMPLEXITY ANALYSIS")
    click.echo("=" * 50)

    summary = results["summary"]
    click.echo(f"📂 Total files: {results['total_files']}")
    click.echo(f"📏 Total lines: {results['total_lines_of_code']}")
    click.echo(f"⚙️ Total functions: {results['total_functions']}")
    click.echo(f"🏗️ Total classes: {results['total_classes']}")

    click.echo("\n📈 Average metrics:")
    click.echo(f"   Complexity score: {summary['average_complexity_score']}")
    click.echo(f"   Cyclomatic complexity: {summary['average_cyclomatic_complexity']}")
    click.echo(f"   Maintainability index: {summary['average_maintainability_index']}")

    click.echo("\n📊 Complexity distribution:")
    for level, count in results["distribution"].items():
        click.echo(f"   {level}: {count} files")

    if results["high_complexity_files"]:
        click.echo(
            f"\n🚨 High complexity files (top {len(results['high_complexity_files'])}):"
        )
        for file_info in results["high_complexity_files"]:
            click.echo(
                f"   • {file_info['file']} (score: {file_info['score']:.1f}, level: {file_info['level']})"
|
||||
)
|
||||
|
||||
if results["recommendations"]:
|
||||
click.echo("\n💡 Recommendations:")
|
||||
for rec in results["recommendations"]:
|
||||
click.echo(f" {rec}")
|
||||
|
||||
|
||||
def _print_console_modernization(
|
||||
results: dict[str, Any], verbose: bool, include_type_hints: bool
|
||||
) -> None:
|
||||
"""Print modernization results in console format."""
|
||||
summary = results["summary"]
|
||||
|
||||
click.echo("\n🔄 MODERNIZATION ANALYSIS")
|
||||
click.echo("=" * 50)
|
||||
click.echo(f"📂 Files analyzed: {summary['total_files_analyzed']}")
|
||||
click.echo(f"⚠️ Files with issues: {summary['files_with_issues']}")
|
||||
click.echo(f"🔧 Total issues: {summary['total_issues']}")
|
||||
click.echo(f"✅ Auto-fixable: {summary['auto_fixable_count']}")
|
||||
|
||||
click.echo("\n📊 Issues by severity:")
|
||||
for severity, count in summary["by_severity"].items():
|
||||
if count > 0:
|
||||
icon = (
|
||||
"🚨" if severity == "error" else "⚠️" if severity == "warning" else "ℹ️"
|
||||
)
|
||||
click.echo(f" {icon} {severity.title()}: {count}")
|
||||
|
||||
click.echo("\n📋 Issues by type:")
|
||||
for issue_type, count in summary["by_type"].items():
|
||||
click.echo(f" • {issue_type.replace('_', ' ').title()}: {count}")
|
||||
|
||||
if summary["top_files_with_issues"]:
|
||||
click.echo("\n🗂️ Files with most issues:")
|
||||
for file_path, count in summary["top_files_with_issues"][:5]:
|
||||
click.echo(f" • {file_path}: {count} issues")
|
||||
|
||||
if summary["recommendations"]:
|
||||
click.echo("\n💡 Recommendations:")
|
||||
for rec in summary["recommendations"]:
|
||||
click.echo(f" {rec}")
|
||||
|
||||
if verbose and results["files"]:
|
||||
click.echo("\n📝 Detailed issues:")
|
||||
for file_path, issues in list(results["files"].items())[:5]: # Show top 5 files
|
||||
click.echo(f"\n 📁 {file_path}:")
|
||||
for issue in issues[:3]: # Show first 3 issues per file
|
||||
severity_icon = (
|
||||
"🚨"
|
||||
if issue["severity"] == "error"
|
||||
else "⚠️"
|
||||
if issue["severity"] == "warning"
|
||||
else "ℹ️"
|
||||
)
|
||||
click.echo(
|
||||
f" {severity_icon} Line {issue['line_number']}: {issue['description']}"
|
||||
)
|
||||
if issue["can_auto_fix"]:
|
||||
click.echo(f" 🔧 Suggested fix: {issue['suggested_fix']}")
|
||||
if len(issues) > 3:
|
||||
click.echo(f" ... and {len(issues) - 3} more issues")
|
||||
|
||||
|
||||
def _print_console_full_analysis(results: dict[str, Any], verbose: bool) -> None:
|
||||
"""Print full analysis results in console format."""
|
||||
click.echo("\n🎯 COMPREHENSIVE CODE QUALITY ANALYSIS")
|
||||
click.echo("=" * 60)
|
||||
|
||||
metadata = results["metadata"]
|
||||
click.echo(f"📂 Total files analyzed: {metadata['total_files']}")
|
||||
click.echo(f"📍 Paths: {', '.join(metadata['analyzed_paths'])}")
|
||||
click.echo(f"🎯 Overall quality score: {results['quality_score']:.1f}/100")
|
||||
|
||||
# Complexity summary
|
||||
complexity = results["complexity"]
|
||||
click.echo("\n📊 COMPLEXITY METRICS")
|
||||
click.echo(f" Average score: {complexity['summary']['average_complexity_score']}")
|
||||
click.echo(f" High complexity files: {len(complexity['high_complexity_files'])}")
|
||||
|
||||
# Duplicates summary
|
||||
duplicates = results["duplicates"]
|
||||
click.echo("\n🔄 DUPLICATE DETECTION")
|
||||
click.echo(
|
||||
f" Duplicate groups: {duplicates['summary']['duplicate_groups_found']}"
|
||||
)
|
||||
click.echo(
|
||||
f" Total duplicate blocks: {duplicates['summary']['total_duplicate_blocks']}"
|
||||
)
|
||||
|
||||
# Code smells summary
|
||||
smells = results["code_smells"]
|
||||
click.echo("\n👃 CODE SMELLS")
|
||||
click.echo(f" Total issues: {smells['total_smells']}")
|
||||
|
||||
if verbose and smells["details"]:
|
||||
click.echo(" Details:")
|
||||
for smell in smells["details"][:10]: # Show first 10
|
||||
click.echo(f" • {smell['file']}: {smell['smell']}")
|
||||
if len(smells["details"]) > 10:
|
||||
click.echo(f" ... and {len(smells['details']) - 10} more")
|
||||
|
||||
|
||||
def _calculate_overall_quality_score(results: dict[str, Any]) -> float:
|
||||
"""Calculate an overall quality score based on all metrics."""
|
||||
score = 100.0
|
||||
|
||||
# Complexity penalty (max -30 points)
|
||||
complexity = results["complexity"]
|
||||
avg_complexity = complexity["summary"]["average_complexity_score"]
|
||||
if avg_complexity > 50:
|
||||
score -= min(30, (avg_complexity - 50) * 0.6)
|
||||
|
||||
# Duplicate penalty (max -30 points)
|
||||
duplicates = results["duplicates"]
|
||||
if duplicates["summary"]["duplicate_groups_found"] > 0:
|
||||
penalty = min(30, duplicates["summary"]["duplicate_groups_found"] * 3)
|
||||
score -= penalty
|
||||
|
||||
# Code smells penalty (max -20 points)
|
||||
smells = results["code_smells"]
|
||||
if smells["total_smells"] > 0:
|
||||
penalty = min(20, smells["total_smells"] * 2)
|
||||
score -= penalty
|
||||
|
||||
# Maintainability bonus/penalty (max ±20 points)
|
||||
avg_maintainability = complexity["summary"]["average_maintainability_index"]
|
||||
if avg_maintainability > 70:
|
||||
score += min(20.0, (avg_maintainability - 70) * 0.5)
|
||||
elif avg_maintainability < 30:
|
||||
score -= min(20.0, (30 - avg_maintainability) * 0.5)
|
||||
|
||||
return max(0.0, score)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.option(
|
||||
"--output-path",
|
||||
"-o",
|
||||
default=".quality-exceptions.yaml",
|
||||
type=click.Path(path_type=Path),
|
||||
help="Output path for exceptions configuration file",
|
||||
)
|
||||
def create_exceptions_template(output_path: Path) -> None:
|
||||
"""Create a template exceptions configuration file."""
|
||||
|
||||
template_content = create_exceptions_config_template()
|
||||
|
||||
if output_path.exists() and not click.confirm(
|
||||
f"File {output_path} already exists. Overwrite?"
|
||||
):
|
||||
click.echo("Aborted.")
|
||||
return
|
||||
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
f.write(template_content)
|
||||
|
||||
click.echo(f"✅ Created exceptions configuration template at: {output_path}")
|
||||
click.echo("📝 Edit this file to configure exception rules for your project")
|
||||
click.echo(f"🔧 Use with: --exceptions-file {output_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli()
|
||||
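For illustration, a minimal sketch of how the penalties in _calculate_overall_quality_score combine; the input numbers below are made up, and the dict carries only the keys the scorer reads:

    results = {
        "complexity": {"summary": {"average_complexity_score": 60,
                                   "average_maintainability_index": 75}},
        "duplicates": {"summary": {"duplicate_groups_found": 4,
                                   "total_duplicate_blocks": 9}},
        "code_smells": {"total_smells": 3, "details": []},
    }
    # 100 - (60 - 50) * 0.6 - 4 * 3 - 3 * 2 + (75 - 70) * 0.5 = 78.5
    print(_calculate_overall_quality_score(results))  # 78.5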
13
src/quality/complexity/__init__.py
Normal file
@@ -0,0 +1,13 @@
"""Code complexity analysis module."""

from .analyzer import ComplexityAnalyzer
from .calculator import ComplexityCalculator
from .metrics import ComplexityMetrics
from .radon_integration import RadonComplexityAnalyzer

__all__ = [
    "ComplexityAnalyzer",
    "ComplexityCalculator",
    "ComplexityMetrics",
    "RadonComplexityAnalyzer",
]
311
src/quality/complexity/analyzer.py
Normal file
@@ -0,0 +1,311 @@
"""High-level complexity analysis interface."""

from pathlib import Path
from typing import Any

from .metrics import ComplexityMetrics
from .radon_integration import RadonComplexityAnalyzer
from ..config.schemas import ComplexityConfig

# Optional runtime import, guarded to avoid circular imports
try:
    from ..core.exceptions import ExceptionFilter
except ImportError:
    ExceptionFilter = None


class ComplexityAnalyzer:
    """High-level interface for code complexity analysis."""

    def __init__(self, config: ComplexityConfig | None = None, full_config: Any = None):
        self.config = config or ComplexityConfig()
        self.radon_analyzer = RadonComplexityAnalyzer(fallback_to_manual=True)

        # Initialize exception filter only if a full config is provided.
        # The annotation is quoted so it is never evaluated at runtime,
        # which would fail when the optional import above set
        # ExceptionFilter to None.
        self.exception_filter: "ExceptionFilter | None" = None
        if full_config:
            from ..core.exceptions import ExceptionFilter

            self.exception_filter = ExceptionFilter(full_config)

    def analyze_code(self, code: str, filename: str = "<string>") -> ComplexityMetrics:
        """Analyze complexity of code string."""
        metrics = self.radon_analyzer.analyze_code(code, filename)
        return self._filter_metrics_by_config(metrics)

    def analyze_file(self, file_path: Path) -> ComplexityMetrics:
        """Analyze complexity of a file."""
        metrics = self.radon_analyzer.analyze_file(file_path)
        return self._filter_metrics_by_config(metrics)

    def batch_analyze_files(
        self, file_paths: list[Path], max_workers: int | None = None
    ) -> dict[Path, ComplexityMetrics]:
        """Analyze multiple files in parallel."""
        raw_results = self.radon_analyzer.batch_analyze_files(file_paths, max_workers)

        # Filter metrics based on configuration
        filtered_results = {}
        for path, metrics in raw_results.items():
            filtered_results[path] = self._filter_metrics_by_config(metrics)

        return filtered_results

    def get_complexity_summary(self, metrics: ComplexityMetrics) -> dict[str, Any]:
        """Get a human-readable summary of complexity metrics."""
        return {
            "overall_score": metrics.get_overall_score(),
            "complexity_level": metrics.get_complexity_level(),
            "priority_score": metrics.get_priority_score(),
            "recommendations": metrics.get_recommendations(),
            "key_metrics": {
                "cyclomatic_complexity": metrics.cyclomatic_complexity,
                "cognitive_complexity": metrics.cognitive_complexity,
                "maintainability_index": metrics.maintainability_index,
                "max_nesting_depth": metrics.max_nesting_depth,
                "lines_of_code": metrics.lines_of_code,
                "function_count": metrics.function_count,
                "class_count": metrics.class_count,
            },
            "flags": self._get_complexity_flags(metrics),
        }

    def get_detailed_report(
        self, code: str, filename: str = "<string>"
    ) -> dict[str, Any]:
        """Get detailed complexity report including function-level analysis."""
        report = self.radon_analyzer.get_detailed_complexity_report(code, filename)

        # Add summary information
        if "file_metrics" in report:
            metrics = ComplexityMetrics.from_dict(report["file_metrics"])
            report["summary"] = self.get_complexity_summary(metrics)

        # Filter functions and classes that exceed thresholds
        if "functions" in report:
            report["high_complexity_functions"] = [
                func
                for func in report["functions"]
                if func["complexity"] >= self.config.complexity_threshold
            ]

        return report

    def find_complex_code(
        self, file_paths: list[Path], max_workers: int | None = None
    ) -> list[dict[str, Any]]:
        """Find code blocks that exceed complexity thresholds."""
        results = self.batch_analyze_files(file_paths, max_workers)
        complex_files = []

        for path, metrics in results.items():
            if self._is_complex(metrics):
                # Check if this complexity issue should be suppressed
                if self.exception_filter:
                    should_suppress, reason = (
                        self.exception_filter.should_suppress_issue(
                            "complexity", "high_complexity", str(path), 1, ""
                        )
                    )
                    if should_suppress:
                        continue

                summary = self.get_complexity_summary(metrics)
                complex_files.append(
                    {
                        "file_path": str(path),
                        "metrics": metrics.to_dict(),
                        "summary": summary,
                        "priority": summary["priority_score"],
                    }
                )

        # Sort by priority (highest first)
        complex_files.sort(key=lambda x: x["priority"], reverse=True)
        return complex_files

    def get_project_complexity_overview(
        self, file_paths: list[Path], max_workers: int | None = None
    ) -> dict[str, Any]:
        """Get overall project complexity statistics."""
        results = self.batch_analyze_files(file_paths, max_workers)

        if not results:
            return {
                "total_files": 0,
                "summary": {},
                "distribution": {},
                "recommendations": [],
            }

        # Aggregate statistics
        total_files = len(results)
        total_lines = sum(m.lines_of_code for m in results.values())
        total_functions = sum(m.function_count for m in results.values())
        total_classes = sum(m.class_count for m in results.values())

        # Complexity distribution
        complexity_levels = {
            "Low": 0,
            "Moderate": 0,
            "High": 0,
            "Very High": 0,
            "Extreme": 0,
        }
        high_complexity_files = []

        for path, metrics in results.items():
            level = metrics.get_complexity_level()
            complexity_levels[level] += 1

            if metrics.get_overall_score() >= 50:  # High complexity threshold
                high_complexity_files.append(
                    {
                        "file": str(path),
                        "score": metrics.get_overall_score(),
                        "level": level,
                    }
                )

        # Sort high complexity files by score
        high_complexity_files.sort(key=lambda x: x["score"], reverse=True)

        # Project-level recommendations
        recommendations = []
        if complexity_levels["Extreme"] > 0:
            recommendations.append(
                f"🚨 {complexity_levels['Extreme']} files with extreme complexity need immediate attention"
            )
        if complexity_levels["Very High"] > 0:
            recommendations.append(
                f"⚠️ {complexity_levels['Very High']} files with very high complexity should be refactored"
            )
        if total_files > 0:
            avg_complexity = (
                sum(m.get_overall_score() for m in results.values()) / total_files
            )
            if avg_complexity > 40:
                recommendations.append(
                    "📈 Overall project complexity is high - consider architectural improvements"
                )

        return {
            "total_files": total_files,
            "total_lines_of_code": total_lines,
            "total_functions": total_functions,
            "total_classes": total_classes,
            "summary": {
                "average_complexity_score": round(
                    sum(m.get_overall_score() for m in results.values()) / total_files,
                    2,
                )
                if total_files > 0
                else 0,
                "average_cyclomatic_complexity": round(
                    sum(m.cyclomatic_complexity for m in results.values())
                    / total_files,
                    2,
                )
                if total_files > 0
                else 0,
                "average_maintainability_index": round(
                    sum(m.maintainability_index for m in results.values())
                    / total_files,
                    2,
                )
                if total_files > 0
                else 0,
            },
            "distribution": complexity_levels,
            "high_complexity_files": high_complexity_files[:10],  # Top 10
            "recommendations": recommendations,
            "config": {
                "complexity_threshold": self.config.complexity_threshold,
                "radon_available": self.radon_analyzer.is_available(),
                "metrics_included": {
                    "cyclomatic_complexity": self.config.include_cyclomatic,
                    "cognitive_complexity": self.config.include_cognitive,
                    "halstead_metrics": self.config.include_halstead,
                    "maintainability_index": self.config.include_maintainability,
                },
            },
        }

    def _filter_metrics_by_config(
        self, metrics: ComplexityMetrics
    ) -> ComplexityMetrics:
        """Filter metrics based on configuration settings."""
        filtered = ComplexityMetrics()

        # Always include basic metrics
        filtered.lines_of_code = metrics.lines_of_code
        filtered.source_lines_of_code = metrics.source_lines_of_code
        filtered.logical_lines_of_code = metrics.logical_lines_of_code
        filtered.comment_lines = metrics.comment_lines
        filtered.blank_lines = metrics.blank_lines
        filtered.function_count = metrics.function_count
        filtered.class_count = metrics.class_count
        filtered.method_count = metrics.method_count

        # Include metrics based on configuration
        if self.config.include_cyclomatic:
            filtered.cyclomatic_complexity = metrics.cyclomatic_complexity

        if self.config.include_cognitive:
            filtered.cognitive_complexity = metrics.cognitive_complexity
            filtered.max_nesting_depth = metrics.max_nesting_depth
            filtered.average_nesting_depth = metrics.average_nesting_depth

        if self.config.include_halstead:
            filtered.halstead_difficulty = metrics.halstead_difficulty
            filtered.halstead_effort = metrics.halstead_effort
            filtered.halstead_volume = metrics.halstead_volume
            filtered.halstead_time = metrics.halstead_time
            filtered.halstead_bugs = metrics.halstead_bugs

        if self.config.include_maintainability:
            filtered.maintainability_index = metrics.maintainability_index

        # Additional metrics
        filtered.parameters_count = metrics.parameters_count
        filtered.variables_count = metrics.variables_count
        filtered.returns_count = metrics.returns_count

        return filtered

    def _is_complex(self, metrics: ComplexityMetrics) -> bool:
        """Check if code is considered complex based on thresholds."""
        return (
            metrics.cyclomatic_complexity >= self.config.complexity_threshold
            or metrics.cognitive_complexity >= self.config.complexity_threshold * 1.5
            or metrics.max_nesting_depth > 4
            or metrics.maintainability_index < 20
        )

    def _get_complexity_flags(self, metrics: ComplexityMetrics) -> list[str]:
        """Get list of complexity warning flags."""
        flags = []

        if metrics.cyclomatic_complexity > self.config.complexity_threshold:
            flags.append("HIGH_CYCLOMATIC_COMPLEXITY")

        if metrics.cognitive_complexity > self.config.complexity_threshold * 1.5:
            flags.append("HIGH_COGNITIVE_COMPLEXITY")

        if metrics.max_nesting_depth > 4:
            flags.append("DEEP_NESTING")

        if metrics.maintainability_index < 20:
            flags.append("LOW_MAINTAINABILITY")

        if metrics.halstead_difficulty > 20:
            flags.append("HIGH_HALSTEAD_DIFFICULTY")

        if metrics.function_count == 0 and metrics.lines_of_code > 50:
            flags.append("LARGE_MONOLITHIC_CODE")

        if metrics.parameters_count > 5:
            flags.append("TOO_MANY_PARAMETERS")

        return flags
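A usage sketch for the analyzer above, assuming the package is importable as quality.complexity (per the src/ layout introduced in this commit):

    from quality.complexity import ComplexityAnalyzer

    analyzer = ComplexityAnalyzer()  # default config: all metrics on, threshold 10
    metrics = analyzer.analyze_code("def f(x):\n    return x + 1\n")
    summary = analyzer.get_complexity_summary(metrics)
    print(summary["complexity_level"])  # trivial code should rank "Low"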
358
src/quality/complexity/calculator.py
Normal file
@@ -0,0 +1,358 @@
"""Manual complexity calculation algorithms."""

import ast
import math
import re
from collections import Counter

from .metrics import ComplexityMetrics


class ComplexityCalculator:
    """Manual complexity calculator using AST analysis."""

    def calculate_complexity(self, code: str) -> ComplexityMetrics:
        """Calculate all complexity metrics for given code."""
        try:
            tree = ast.parse(code)
            return self._analyze_ast(tree, code)
        except SyntaxError:
            # Return basic metrics for malformed code
            return self._analyze_text_metrics(code)

    def _analyze_ast(self, tree: ast.AST, code: str) -> ComplexityMetrics:
        """Analyze AST to extract complexity metrics."""
        metrics = ComplexityMetrics()

        # Basic line counts
        lines = code.split("\n")
        metrics.lines_of_code = len(lines)
        metrics.blank_lines = len([line for line in lines if not line.strip()])
        metrics.comment_lines = len(
            [line for line in lines if line.strip().startswith("#")]
        )
        metrics.source_lines_of_code = (
            metrics.lines_of_code - metrics.blank_lines - metrics.comment_lines
        )

        # AST-based metrics
        for node in ast.walk(tree):
            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
                metrics.function_count += 1
                # Count parameters
                metrics.parameters_count += len(node.args.args)
                # Count returns
                metrics.returns_count += len(
                    [n for n in ast.walk(node) if isinstance(n, ast.Return)]
                )
            elif isinstance(node, ast.ClassDef):
                metrics.class_count += 1

        # Calculate cyclomatic complexity
        metrics.cyclomatic_complexity = self._calculate_cyclomatic_complexity(tree)

        # Calculate cognitive complexity
        metrics.cognitive_complexity = self._calculate_cognitive_complexity(tree)

        # Calculate nesting metrics
        metrics.max_nesting_depth, metrics.average_nesting_depth = (
            self._calculate_nesting_metrics(tree)
        )

        # Calculate Halstead metrics
        halstead = self._calculate_halstead_metrics(tree)
        metrics.halstead_difficulty = halstead.get("difficulty", 0.0)
        metrics.halstead_volume = halstead.get("volume", 0.0)
        metrics.halstead_effort = halstead.get("effort", 0.0)
        metrics.halstead_time = halstead.get("time", 0.0)
        metrics.halstead_bugs = halstead.get("bugs", 0.0)

        # Calculate maintainability index
        metrics.maintainability_index = self._calculate_maintainability_index(metrics)

        # Logical lines of code (non-empty, non-comment)
        metrics.logical_lines_of_code = self._count_logical_lines(tree)

        # Count variables
        metrics.variables_count = self._count_variables(tree)

        # Count methods in classes
        metrics.method_count = self._count_methods(tree)

        return metrics

    def _analyze_text_metrics(self, code: str) -> ComplexityMetrics:
        """Fallback text-based analysis for malformed code."""
        metrics = ComplexityMetrics()

        lines = code.split("\n")
        metrics.lines_of_code = len(lines)
        metrics.blank_lines = len([line for line in lines if not line.strip()])
        metrics.comment_lines = len(
            [line for line in lines if line.strip().startswith("#")]
        )
        metrics.source_lines_of_code = (
            metrics.lines_of_code - metrics.blank_lines - metrics.comment_lines
        )

        # Basic pattern matching
        metrics.function_count = len(re.findall(r"^\s*def\s+\w+", code, re.MULTILINE))
        metrics.class_count = len(re.findall(r"^\s*class\s+\w+", code, re.MULTILINE))

        return metrics

    def _calculate_cyclomatic_complexity(self, tree: ast.AST) -> int:
        """Calculate McCabe cyclomatic complexity."""
        complexity = 1  # Base complexity

        for node in ast.walk(tree):
            if isinstance(
                node,
                (
                    ast.If,
                    ast.While,
                    ast.For,
                    ast.AsyncFor,
                    ast.ExceptHandler,
                    ast.With,
                    ast.Assert,
                ),
            ):
                complexity += 1
            elif isinstance(node, ast.BoolOp):
                # Add complexity for boolean operations (and, or)
                complexity += len(node.values) - 1
            elif isinstance(node, ast.Expr) and isinstance(node.value, ast.IfExp):
                # Ternary operator
                complexity += 1

        return complexity

    def _calculate_cognitive_complexity(self, tree: ast.AST) -> int:
        """Calculate cognitive complexity (similar to SonarQube)."""
        complexity = 0

        def visit_node(node: ast.AST, depth: int = 0) -> None:
            nonlocal complexity
            local_complexity = 0

            if isinstance(
                node,
                (
                    ast.If,
                    ast.While,
                    ast.For,
                    ast.AsyncFor,
                    ast.ExceptHandler,
                    ast.With,
                ),
            ):
                local_complexity += 1 + depth
            elif isinstance(node, ast.BoolOp):
                # Logical operators add complexity
                local_complexity += len(node.values) - 1
            elif isinstance(node, ast.Lambda) or (
                isinstance(node, ast.Expr) and isinstance(node.value, ast.IfExp)
            ):
                local_complexity += 1

            complexity += local_complexity

            # Increase nesting for control structures
            new_depth = (
                depth + 1
                if isinstance(
                    node,
                    (
                        ast.If,
                        ast.While,
                        ast.For,
                        ast.AsyncFor,
                        ast.ExceptHandler,
                        ast.With,
                    ),
                )
                else depth
            )

            # Recursively visit children
            for child in ast.iter_child_nodes(node):
                visit_node(child, new_depth)

        visit_node(tree)
        return complexity

    def _calculate_nesting_metrics(self, tree: ast.AST) -> tuple[int, float]:
        """Calculate nesting depth metrics."""
        depths = []

        def visit_node(node: ast.AST, depth: int = 0) -> None:
            current_depth = depth

            if isinstance(
                node, (ast.If, ast.While, ast.For, ast.AsyncFor, ast.With, ast.Try)
            ):
                current_depth += 1
                depths.append(current_depth)

            for child in ast.iter_child_nodes(node):
                visit_node(child, current_depth)

        visit_node(tree)

        max_depth = max(depths) if depths else 0
        avg_depth = sum(depths) / len(depths) if depths else 0.0

        return max_depth, round(avg_depth, 2)

    def _calculate_halstead_metrics(self, tree: ast.AST) -> dict[str, float]:
        """Calculate Halstead complexity metrics."""
        operators: Counter[str] = Counter()
        operands: Counter[str] = Counter()

        for node in ast.walk(tree):
            # Operators
            if isinstance(node, (ast.BinOp, ast.UnaryOp)):
                operators[type(node.op).__name__] += 1
            elif isinstance(node, ast.Compare):
                for op in node.ops:
                    operators[type(op).__name__] += 1
            elif isinstance(node, ast.BoolOp):
                operators[type(node.op).__name__] += 1
            elif isinstance(node, (ast.If, ast.While, ast.For, ast.AsyncFor)):
                operators["control"] += 1
            elif isinstance(node, ast.Call):
                operators["call"] += 1
            elif isinstance(node, (ast.Assign, ast.AugAssign)):
                operators["assign"] += 1

            # Operands
            if isinstance(node, ast.Name):
                operands[node.id] += 1
            elif isinstance(node, ast.Constant):
                operands[str(node.value)] += 1
            elif isinstance(node, ast.Attribute):
                operands[node.attr] += 1

        # Halstead metrics
        n1 = len(operators)  # Number of unique operators
        n2 = len(operands)  # Number of unique operands
        N1 = sum(operators.values())  # Total operators
        N2 = sum(operands.values())  # Total operands

        vocabulary = n1 + n2
        length = N1 + N2

        if n2 == 0:
            return {
                "difficulty": 0.0,
                "volume": 0.0,
                "effort": 0.0,
                "time": 0.0,
                "bugs": 0.0,
            }

        # Prevent division by zero and invalid log
        if vocabulary <= 1:
            volume = 0.0
        else:
            volume = length * math.log2(vocabulary)

        difficulty = (n1 / 2) * (N2 / n2) if n2 > 0 else 0.0
        effort = difficulty * volume
        time = effort / 18  # Seconds
        bugs = volume / 3000  # Delivered bugs estimation

        return {
            "difficulty": round(difficulty, 2),
            "volume": round(volume, 2),
            "effort": round(effort, 2),
            "time": round(time, 2),
            "bugs": round(bugs, 4),
        }

    def _calculate_maintainability_index(self, metrics: ComplexityMetrics) -> float:
        """Calculate maintainability index."""
        # Original Microsoft formula adapted:
        # MI = 171 - 5.2 * ln(HV) - 0.23 * CC - 16.2 * ln(LOC)
        # where HV = Halstead Volume, CC = Cyclomatic Complexity, LOC = Lines of Code

        if metrics.halstead_volume <= 0 or metrics.source_lines_of_code <= 0:
            return 100.0  # Default high maintainability for simple code

        try:
            mi: float = (
                171
                - 5.2 * math.log(metrics.halstead_volume)
                - 0.23 * metrics.cyclomatic_complexity
                - 16.2 * math.log(metrics.source_lines_of_code)
            )

            # Normalize to 0-100 scale
            mi = max(0.0, min(100.0, mi))
            return round(mi, 2)
        except (ValueError, ZeroDivisionError):
            return 50.0  # Default moderate maintainability

    def _count_logical_lines(self, tree: ast.AST) -> int:
        """Count logical lines of code (AST nodes that represent statements)."""
        count = 0

        for node in ast.walk(tree):
            if isinstance(
                node,
                ast.Assign
                | ast.AugAssign
                | ast.Return
                | ast.Yield
                | ast.YieldFrom
                | ast.Expr
                | ast.Import
                | ast.ImportFrom
                | ast.Pass
                | ast.Break
                | ast.Continue
                | ast.Global
                | ast.Nonlocal
                | ast.Assert,
            ):
                count += 1

        return count

    def _count_variables(self, tree: ast.AST) -> int:
        """Count unique variable names."""
        variables = set()

        for node in ast.walk(tree):
            if isinstance(node, ast.Name) and isinstance(
                node.ctx, (ast.Store, ast.Del)
            ):
                variables.add(node.id)

        return len(variables)

    def _count_methods(self, tree: ast.AST) -> int:
        """Count methods inside classes."""
        method_count = 0

        for node in ast.walk(tree):
            if isinstance(node, ast.ClassDef):
                for child in node.body:
                    if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef)):
                        method_count += 1

        return method_count
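A worked example of the manual cyclomatic count, calling the calculator directly (module path assumed from the src/ layout):

    import textwrap
    from quality.complexity.calculator import ComplexityCalculator

    code = textwrap.dedent('''
        def classify(x):
            if x > 0 and x < 10:
                return "small"
            return "other"
    ''')
    m = ComplexityCalculator().calculate_complexity(code)
    # Base 1, +1 for the `if`, +1 for the two-operand `and` (BoolOp) => 3
    print(m.cyclomatic_complexity)  # 3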
186
src/quality/complexity/metrics.py
Normal file
@@ -0,0 +1,186 @@
"""Complexity metrics data structures and calculations."""

from dataclasses import dataclass
from typing import Any


@dataclass
class ComplexityMetrics:
    """Container for various complexity metrics."""

    # Cyclomatic complexity
    cyclomatic_complexity: int = 0

    # Cognitive complexity
    cognitive_complexity: int = 0

    # Halstead metrics
    halstead_difficulty: float = 0.0
    halstead_effort: float = 0.0
    halstead_volume: float = 0.0
    halstead_time: float = 0.0
    halstead_bugs: float = 0.0

    # Maintainability index
    maintainability_index: float = 0.0

    # Raw metrics
    lines_of_code: int = 0
    source_lines_of_code: int = 0
    logical_lines_of_code: int = 0
    comment_lines: int = 0
    blank_lines: int = 0

    # Function/class counts
    function_count: int = 0
    class_count: int = 0
    method_count: int = 0

    # Nesting and depth metrics
    max_nesting_depth: int = 0
    average_nesting_depth: float = 0.0

    # Additional metrics
    parameters_count: int = 0
    variables_count: int = 0
    returns_count: int = 0

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary representation."""
        return {
            "cyclomatic_complexity": self.cyclomatic_complexity,
            "cognitive_complexity": self.cognitive_complexity,
            "halstead_difficulty": self.halstead_difficulty,
            "halstead_effort": self.halstead_effort,
            "halstead_volume": self.halstead_volume,
            "halstead_time": self.halstead_time,
            "halstead_bugs": self.halstead_bugs,
            "maintainability_index": self.maintainability_index,
            "lines_of_code": self.lines_of_code,
            "source_lines_of_code": self.source_lines_of_code,
            "logical_lines_of_code": self.logical_lines_of_code,
            "comment_lines": self.comment_lines,
            "blank_lines": self.blank_lines,
            "function_count": self.function_count,
            "class_count": self.class_count,
            "method_count": self.method_count,
            "max_nesting_depth": self.max_nesting_depth,
            "average_nesting_depth": self.average_nesting_depth,
            "parameters_count": self.parameters_count,
            "variables_count": self.variables_count,
            "returns_count": self.returns_count,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "ComplexityMetrics":
        """Create from dictionary representation."""
        return cls(**data)

    def get_overall_score(self) -> float:
        """Calculate overall complexity score (0-100, lower is better)."""
        # Weighted combination of different metrics;
        # higher weights for more important complexity indicators.
        score = 0.0

        # Cyclomatic complexity (weight: 30%)
        cyclomatic_score = min(self.cyclomatic_complexity * 2, 100)
        score += cyclomatic_score * 0.3

        # Cognitive complexity (weight: 30%)
        cognitive_score = min(self.cognitive_complexity * 2, 100)
        score += cognitive_score * 0.3

        # Maintainability index (weight: 20%, inverted since higher is better)
        maintainability_score = max(100 - self.maintainability_index, 0)
        score += maintainability_score * 0.2

        # Nesting depth (weight: 10%)
        nesting_score = min(self.max_nesting_depth * 10, 100)
        score += nesting_score * 0.1

        # Halstead difficulty (weight: 10%)
        halstead_score = min(self.halstead_difficulty * 3, 100)
        score += halstead_score * 0.1

        return round(score, 2)

    def get_complexity_level(self) -> str:
        """Get human-readable complexity level."""
        score = self.get_overall_score()

        if score < 20:
            return "Low"
        elif score < 40:
            return "Moderate"
        elif score < 60:
            return "High"
        elif score < 80:
            return "Very High"
        else:
            return "Extreme"

    def get_priority_score(self) -> float:
        """Get priority score for refactoring (0-1, higher means higher priority)."""
        overall_score = self.get_overall_score()

        # Convert to 0-1 scale
        priority = overall_score / 100.0

        # Boost priority for extreme cases
        if self.cyclomatic_complexity > 20:
            priority = min(priority + 0.2, 1.0)
        if self.cognitive_complexity > 25:
            priority = min(priority + 0.2, 1.0)
        if self.max_nesting_depth > 5:
            priority = min(priority + 0.1, 1.0)

        return round(priority, 3)

    def get_recommendations(self) -> list[str]:
        """Get complexity reduction recommendations."""
        recommendations = []

        if self.cyclomatic_complexity > 10:
            recommendations.append(
                f"High cyclomatic complexity ({self.cyclomatic_complexity}). "
                "Consider breaking down complex conditional logic."
            )

        if self.cognitive_complexity > 15:
            recommendations.append(
                f"High cognitive complexity ({self.cognitive_complexity}). "
                "Consider extracting nested logic into separate methods."
            )

        if self.max_nesting_depth > 4:
            recommendations.append(
                f"Deep nesting detected ({self.max_nesting_depth} levels). "
                "Consider using guard clauses or early returns."
            )

        if self.maintainability_index < 20:
            recommendations.append(
                f"Low maintainability index ({self.maintainability_index:.1f}). "
                "Consider refactoring for better readability and simplicity."
            )

        if self.halstead_difficulty > 20:
            recommendations.append(
                f"High Halstead difficulty ({self.halstead_difficulty:.1f}). "
                "Code may be hard to understand and maintain."
            )

        if self.function_count == 0 and self.lines_of_code > 50:
            recommendations.append(
                "Large code block without functions. "
                "Consider extracting reusable functions."
            )

        if self.parameters_count > 5:
            recommendations.append(
                f"Many parameters ({self.parameters_count}). "
                "Consider using parameter objects or configuration classes."
            )

        return recommendations
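To make the weighting in get_overall_score concrete, a worked example with hand-picked values:

    from quality.complexity.metrics import ComplexityMetrics

    m = ComplexityMetrics(
        cyclomatic_complexity=12,    # min(24, 100) * 0.3 = 7.2
        cognitive_complexity=18,     # min(36, 100) * 0.3 = 10.8
        maintainability_index=65.0,  # (100 - 65) * 0.2 = 7.0
        max_nesting_depth=3,         # min(30, 100) * 0.1 = 3.0
        halstead_difficulty=8.0,     # min(24, 100) * 0.1 = 2.4
    )
    print(m.get_overall_score())     # 7.2 + 10.8 + 7.0 + 3.0 + 2.4 = 30.4
    print(m.get_complexity_level())  # 30.4 falls in the 20-40 band: "Moderate"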
348
src/quality/complexity/radon_integration.py
Normal file
@@ -0,0 +1,348 @@
"""Radon integration for professional complexity analysis."""

import ast
from pathlib import Path
from typing import Any

try:
    from radon.complexity import cc_rank, cc_visit
    from radon.metrics import h_visit, mi_visit
    from radon.raw import analyze

    RADON_AVAILABLE = True
except ImportError:
    RADON_AVAILABLE = False

from .calculator import ComplexityCalculator
from .metrics import ComplexityMetrics


class RadonComplexityAnalyzer:
    """Professional complexity analyzer using Radon library."""

    def __init__(self, fallback_to_manual: bool = True):
        self.fallback_to_manual = fallback_to_manual
        self.manual_calculator = ComplexityCalculator()

    def analyze_code(self, code: str, filename: str = "<string>") -> ComplexityMetrics:
        """Analyze code complexity using Radon or fallback to manual calculation."""
        if RADON_AVAILABLE:
            return self._analyze_with_radon(code, filename)
        elif self.fallback_to_manual:
            return self.manual_calculator.calculate_complexity(code)
        else:
            raise ImportError("Radon is not available and fallback is disabled")

    def analyze_file(self, file_path: Path) -> ComplexityMetrics:
        """Analyze complexity of a file."""
        try:
            with open(file_path, encoding="utf-8") as f:
                code = f.read()
            return self.analyze_code(code, str(file_path))
        except Exception:
            # Return empty metrics for unreadable files
            return ComplexityMetrics()

    def _analyze_with_radon(self, code: str, filename: str) -> ComplexityMetrics:
        """Analyze code using Radon library."""
        metrics = ComplexityMetrics()

        try:
            # Raw metrics (lines of code, etc.)
            raw_metrics = analyze(code)
            if raw_metrics:
                metrics.lines_of_code = raw_metrics.loc
                metrics.logical_lines_of_code = raw_metrics.lloc
                metrics.source_lines_of_code = raw_metrics.sloc
                metrics.comment_lines = raw_metrics.comments
                metrics.blank_lines = raw_metrics.blank

            # Cyclomatic complexity
            cc_results = cc_visit(code)
            if cc_results:
                # Sum up complexity from all functions/methods
                total_complexity = sum(block.complexity for block in cc_results)
                metrics.cyclomatic_complexity = total_complexity

                # Count functions and classes. Block attributes vary between
                # Radon versions, so access them defensively.
                metrics.function_count = len(
                    [
                        b
                        for b in cc_results
                        if getattr(b, "is_method", False)
                        or getattr(b, "type", None) == "function"
                    ]
                )
                metrics.class_count = len(
                    [b for b in cc_results if getattr(b, "type", None) == "class"]
                )
                metrics.method_count = len(
                    [b for b in cc_results if getattr(b, "is_method", False)]
                )

            # Halstead metrics
            try:
                halstead_data = h_visit(code)
                # Recent Radon versions expose per-file totals under `.total`;
                # fall back to the object itself for older releases.
                halstead_report = getattr(halstead_data, "total", halstead_data)
                if halstead_report:
                    metrics.halstead_difficulty = halstead_report.difficulty
                    metrics.halstead_effort = halstead_report.effort
                    metrics.halstead_volume = halstead_report.volume
                    metrics.halstead_time = halstead_report.time
                    metrics.halstead_bugs = halstead_report.bugs
            except Exception:
                # Halstead calculation can fail for some code patterns
                pass

            # Maintainability Index
            try:
                mi_data = mi_visit(code, multi=True)
                # mi_visit returns the index as a plain float in current Radon
                # releases; older code exposed an object with an `mi` attribute.
                metrics.maintainability_index = float(getattr(mi_data, "mi", mi_data))
            except Exception:
                # MI calculation can fail, calculate manually
                metrics.maintainability_index = self._calculate_mi_fallback(metrics)

            # Calculate additional metrics manually
            metrics = self._enhance_with_manual_metrics(code, metrics)

        except Exception:
            # If Radon fails completely, fallback to manual calculation
            if self.fallback_to_manual:
                return self.manual_calculator.calculate_complexity(code)
            else:
                raise

        return metrics

    def _enhance_with_manual_metrics(
        self, code: str, metrics: ComplexityMetrics
    ) -> ComplexityMetrics:
        """Add metrics not provided by Radon using manual calculation."""
        try:
            tree = ast.parse(code)

            # Calculate cognitive complexity manually
            metrics.cognitive_complexity = self._calculate_cognitive_complexity(tree)

            # Calculate nesting metrics
            max_depth, avg_depth = self._calculate_nesting_metrics(tree)
            metrics.max_nesting_depth = max_depth
            metrics.average_nesting_depth = avg_depth

            # Count parameters and returns per function
            for node in ast.walk(tree):
                if isinstance(node, ast.FunctionDef | ast.AsyncFunctionDef):
                    metrics.parameters_count += len(node.args.args)
                    metrics.returns_count += len(
                        [n for n in ast.walk(node) if isinstance(n, ast.Return)]
                    )

            # Count variables
            variables = set()
            for node in ast.walk(tree):
                if isinstance(node, ast.Name) and isinstance(
                    node.ctx, ast.Store | ast.Del
                ):
                    variables.add(node.id)
            metrics.variables_count = len(variables)

        except SyntaxError:
            # If AST parsing fails, keep existing metrics
            pass

        return metrics

    def _calculate_cognitive_complexity(self, tree: ast.AST) -> int:
        """Calculate cognitive complexity manually."""
        complexity = 0

        def visit_node(node: ast.AST, depth: int = 0) -> None:
            nonlocal complexity
            local_complexity = 0

            if isinstance(
                node,
                ast.If
                | ast.While
                | ast.For
                | ast.AsyncFor
                | ast.ExceptHandler
                | ast.With,
            ):
                local_complexity += 1 + depth
            elif isinstance(node, ast.BoolOp):
                local_complexity += len(node.values) - 1
            elif isinstance(node, ast.Lambda) or (
                isinstance(node, ast.Expr) and isinstance(node.value, ast.IfExp)
            ):
                local_complexity += 1

            complexity += local_complexity

            # Increase nesting for control structures
            new_depth = (
                depth + 1
                if isinstance(
                    node,
                    ast.If
                    | ast.While
                    | ast.For
                    | ast.AsyncFor
                    | ast.ExceptHandler
                    | ast.With,
                )
                else depth
            )

            for child in ast.iter_child_nodes(node):
                visit_node(child, new_depth)

        visit_node(tree)
        return complexity

    def _calculate_nesting_metrics(self, tree: ast.AST) -> tuple[int, float]:
        """Calculate nesting depth metrics."""
        depths = []

        def visit_node(node: ast.AST, depth: int = 0) -> None:
            current_depth = depth

            if isinstance(
                node, ast.If | ast.While | ast.For | ast.AsyncFor | ast.With | ast.Try
            ):
                current_depth += 1
                depths.append(current_depth)

            for child in ast.iter_child_nodes(node):
                visit_node(child, current_depth)

        visit_node(tree)

        max_depth = max(depths) if depths else 0
        avg_depth = sum(depths) / len(depths) if depths else 0.0

        return max_depth, round(avg_depth, 2)

    def _calculate_mi_fallback(self, metrics: ComplexityMetrics) -> float:
        """Calculate maintainability index when Radon fails."""
        import math

        if metrics.halstead_volume <= 0 or metrics.source_lines_of_code <= 0:
            return 100.0

        try:
            mi = (
                171
                - 5.2 * math.log(metrics.halstead_volume)
                - 0.23 * metrics.cyclomatic_complexity
                - 16.2 * math.log(metrics.source_lines_of_code)
            )

            return max(0, min(100, round(mi, 2)))
        except (ValueError, ZeroDivisionError):
            return 50.0

    def get_complexity_rank(self, complexity_score: int) -> str:
        """Get complexity rank using Radon's ranking system."""
        if not RADON_AVAILABLE:
            # Manual ranking
            if complexity_score <= 5:
                return "A"  # Low
            elif complexity_score <= 10:
                return "B"  # Moderate
            elif complexity_score <= 20:
                return "C"  # High
            elif complexity_score <= 30:
                return "D"  # Very High
            else:
                return "F"  # Extreme

        return cc_rank(complexity_score)

    def batch_analyze_files(
        self, file_paths: list[Path], max_workers: int | None = None
    ) -> dict[Path, ComplexityMetrics]:
        """Analyze multiple files in parallel."""
        import concurrent.futures
        import os

        if max_workers is None:
            max_workers = os.cpu_count() or 4

        results = {}

        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all tasks
            future_to_path = {
                executor.submit(self.analyze_file, path): path for path in file_paths
            }

            # Collect results
            for future in concurrent.futures.as_completed(future_to_path):
                path = future_to_path[future]
                try:
                    results[path] = future.result()
                except Exception:
                    # Create empty metrics for failed files
                    results[path] = ComplexityMetrics()

        return results

    def get_detailed_complexity_report(
        self, code: str, filename: str = "<string>"
    ) -> dict[str, Any]:
        """Get detailed complexity report including function-level analysis."""
        if not RADON_AVAILABLE:
            metrics = self.manual_calculator.calculate_complexity(code)
            return {
                "file_metrics": metrics.to_dict(),
                "functions": [],
                "classes": [],
                "radon_available": False,
            }

        metrics = self._analyze_with_radon(code, filename)

        # Get function-level details from Radon
        functions = []
        classes = []

        try:
            cc_results = cc_visit(code)
            for block in cc_results:
                item = {
                    "name": block.name,
                    "complexity": block.complexity,
                    "rank": self.get_complexity_rank(block.complexity),
                    "line_number": block.lineno,
                    "end_line": getattr(block, "endline", None),
                    "type": getattr(block, "type", None),
                    "is_method": getattr(block, "is_method", False),
                }

                if item["type"] == "function" or item["is_method"]:
                    functions.append(item)
                elif item["type"] == "class":
                    classes.append(item)
        except Exception:
            pass

        return {
            "file_metrics": metrics.to_dict(),
            "functions": functions,
            "classes": classes,
            "radon_available": True,
        }

    @staticmethod
    def is_available() -> bool:
        """Check if Radon is available."""
        return RADON_AVAILABLE

    @staticmethod
    def get_radon_version() -> str | None:
        """Get Radon version if available."""
        if not RADON_AVAILABLE:
            return None

        try:
            import radon

            return getattr(radon, "__version__", "unknown")
        except Exception:
            return "unknown"
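A quick sketch of the fallback behaviour; the same call works whether or not Radon is installed (import path assumed from the src/ layout):

    from quality.complexity.radon_integration import RadonComplexityAnalyzer

    analyzer = RadonComplexityAnalyzer(fallback_to_manual=True)
    print("radon installed:", RadonComplexityAnalyzer.is_available())
    metrics = analyzer.analyze_code("for i in range(3):\n    print(i)\n")
    print(metrics.lines_of_code, metrics.max_nesting_depth)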
5
src/quality/config/__init__.py
Normal file
@@ -0,0 +1,5 @@
"""Configuration management for code quality analysis."""

from .schemas import QualityConfig, load_config

__all__ = ["QualityConfig", "load_config"]
128
src/quality/config/default_config.yaml
Normal file
@@ -0,0 +1,128 @@
# Default configuration for IntelliKit Quality Analysis

# Detection settings
detection:
  min_lines: 5                  # Minimum lines for duplicate detection
  min_tokens: 50                # Minimum tokens for duplicate detection
  similarity_threshold: 0.8     # Similarity threshold (0.0-1.0)

  # Similarity algorithm weights (should sum to ~1.0) - Optimized for better accuracy
  similarity_algorithms:
    - name: structural
      weight: 0.5
      enabled: true
    - name: cosine
      weight: 0.2
      enabled: true
    - name: jaccard
      weight: 0.15
      enabled: true
    - name: levenshtein
      weight: 0.1
      enabled: true
    - name: semantic
      weight: 0.05
      enabled: true

  # Performance settings - Optimized LSH parameters
  use_lsh: true                 # Use LSH for large codebases
  lsh_threshold: 500            # Use LSH when blocks > this number (reduced for better coverage)
  lsh_bands: 20                 # Number of LSH bands (increased for better precision)
  lsh_rows: 4                   # Rows per band (decreased to balance precision/recall)
  lsh_num_perm: 256             # Number of permutations (increased for better accuracy)
  parallel_processing: true     # Enable parallel processing
  max_workers: null             # Auto-detect CPU cores

# Complexity analysis
complexity:
  include_cyclomatic: true      # Include McCabe complexity
  include_cognitive: true       # Include cognitive complexity
  include_halstead: true        # Include Halstead metrics
  include_maintainability: true # Include maintainability index
  complexity_threshold: 10      # Threshold for flagging complex code

# Language support
languages:
  languages:
    - python
    - javascript
    - typescript
  file_extensions:
    python: [".py", ".pyx", ".pyi"]
    javascript: [".js", ".jsx", ".es6", ".mjs"]
    typescript: [".ts", ".tsx"]

# File path configuration
paths:
  include_patterns:
    - "**/*.py"
    - "**/*.js"
    - "**/*.ts"
  exclude_patterns:
    - "**/__pycache__/**"
    - "**/*.pyc"
    - "**/venv/**"
    - "**/.venv/**"
    - "**/node_modules/**"
    - "**/.git/**"
    - "**/build/**"
    - "**/dist/**"
    - "**/migrations/**"
  max_files: null               # No limit
  follow_symlinks: false

# Refactoring suggestions
refactoring:
  enabled: true
  min_priority_score: 1.0       # Minimum priority for suggestions
  suggest_extract_method: true
  suggest_extract_class: true
  suggest_parameter_object: true
  suggest_template_method: true
  estimate_effort: true         # Include effort estimates
  risk_threshold: 0.7           # Risk threshold for suggestions

# Reporting configuration
reporting:
  formats: ["console"]          # Output formats
  output_dir: "./quality_reports"

  # Console output settings
  show_code_preview: true
  show_complexity_metrics: true
  show_refactoring_suggestions: true

  # Dashboard settings
  dashboard_enabled: false
  dashboard_port: 8080
  dashboard_host: "localhost"

  # Export formats
  export_sarif: false           # IDE integration format
  export_json: false
  export_html: false
  export_csv: false

# Cache configuration
cache:
  enabled: true
  cache_dir: ".quality_cache"
  max_age_days: 7               # Cache expiry in days
  use_memory_cache: true

# External integrations
integrations:
  # Git integration
  use_git: true
  analyze_git_history: false    # Analyze historical changes
  blame_duplicates: false       # Show git blame for duplicates

  # JSCPD for multi-language support
  use_jscpd: true
  jscpd_path: null              # Auto-detect jscpd path
  jscpd_config: {}              # Additional jscpd options

# Global settings
version: "1.0.0"
debug: false
verbose: false
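The defaults above can be inspected directly with PyYAML; a minimal sketch (the path assumes a source checkout):

    import yaml
    from pathlib import Path

    cfg = yaml.safe_load(Path("src/quality/config/default_config.yaml").read_text())
    print(cfg["detection"]["similarity_threshold"])   # 0.8
    print(cfg["complexity"]["complexity_threshold"])  # 10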
294
src/quality/config/schemas.py
Normal file
@@ -0,0 +1,294 @@
"""Configuration schemas using Pydantic."""

from pathlib import Path

import yaml
from pydantic import BaseModel, Field, field_validator


class SimilarityAlgorithmConfig(BaseModel):
    """Configuration for similarity algorithms."""

    name: str
    weight: float = Field(default=1.0, ge=0.0, le=1.0)
    enabled: bool = True
    parameters: dict[str, str | int | float | bool] = Field(default_factory=dict)


class ComplexityConfig(BaseModel):
    """Configuration for complexity analysis."""

    include_cyclomatic: bool = True
    include_cognitive: bool = True
    include_halstead: bool = True
    include_maintainability: bool = True
    complexity_threshold: int = Field(default=10, ge=1)


class DetectionConfig(BaseModel):
    """Configuration for duplicate detection."""

    min_lines: int = Field(default=5, ge=1)
    min_tokens: int = Field(default=50, ge=1)
    similarity_threshold: float = Field(default=0.8, ge=0.0, le=1.0)

    # Similarity algorithms
    similarity_algorithms: list[SimilarityAlgorithmConfig] = Field(
        default_factory=lambda: [
            SimilarityAlgorithmConfig(name="levenshtein", weight=0.2),
            SimilarityAlgorithmConfig(name="jaccard", weight=0.3),
            SimilarityAlgorithmConfig(name="cosine", weight=0.3),
            SimilarityAlgorithmConfig(name="semantic", weight=0.2),
        ]
    )

    # Performance settings
    use_lsh: bool = True
    lsh_threshold: int = Field(
        default=1000, ge=100
    )  # Use LSH for datasets larger than this
    parallel_processing: bool = True
    max_workers: int | None = None
|
||||
|
||||
class LanguageConfig(BaseModel):
|
||||
"""Configuration for language support."""
|
||||
|
||||
languages: set[str] = Field(default_factory=lambda: {"python"})
|
||||
file_extensions: dict[str, list[str]] = Field(
|
||||
default_factory=lambda: {
|
||||
"python": [".py", ".pyx", ".pyi"],
|
||||
"javascript": [".js", ".jsx", ".es6", ".mjs"],
|
||||
"typescript": [".ts", ".tsx"],
|
||||
"java": [".java"],
|
||||
"c": [".c", ".h"],
|
||||
"cpp": [".cpp", ".cxx", ".cc", ".hpp", ".hxx"],
|
||||
"csharp": [".cs"],
|
||||
"go": [".go"],
|
||||
"rust": [".rs"],
|
||||
"php": [".php"],
|
||||
"ruby": [".rb"],
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
class PathConfig(BaseModel):
|
||||
"""Configuration for file paths."""
|
||||
|
||||
include_patterns: list[str] = Field(default_factory=lambda: ["**/*.py"])
|
||||
exclude_patterns: list[str] = Field(
|
||||
default_factory=lambda: [
|
||||
"**/__pycache__/**",
|
||||
"**/*.pyc",
|
||||
"**/venv/**",
|
||||
"**/.venv/**",
|
||||
"**/node_modules/**",
|
||||
"**/.git/**",
|
||||
"**/build/**",
|
||||
"**/dist/**",
|
||||
]
|
||||
)
|
||||
max_files: int | None = None
|
||||
follow_symlinks: bool = False
|
||||
|
||||
|
||||
class RefactoringConfig(BaseModel):
|
||||
"""Configuration for refactoring suggestions."""
|
||||
|
||||
enabled: bool = True
|
||||
min_priority_score: float = Field(default=1.0, ge=0.0)
|
||||
suggest_extract_method: bool = True
|
||||
suggest_extract_class: bool = True
|
||||
suggest_parameter_object: bool = True
|
||||
suggest_template_method: bool = True
|
||||
estimate_effort: bool = True
|
||||
risk_threshold: float = Field(default=0.7, ge=0.0, le=1.0)
|
||||
|
||||
|
||||
class ReportingConfig(BaseModel):
|
||||
"""Configuration for reporting."""
|
||||
|
||||
formats: list[str] = Field(default_factory=lambda: ["console"])
|
||||
output_dir: Path = Field(default=Path("./quality_reports"))
|
||||
|
||||
# Console reporting
|
||||
show_code_preview: bool = True
|
||||
show_complexity_metrics: bool = True
|
||||
show_refactoring_suggestions: bool = True
|
||||
|
||||
# Dashboard settings
|
||||
dashboard_enabled: bool = False
|
||||
dashboard_port: int = Field(default=8080, ge=1024, le=65535)
|
||||
dashboard_host: str = "localhost"
|
||||
|
||||
# Export formats
|
||||
export_sarif: bool = False
|
||||
export_json: bool = False
|
||||
export_html: bool = False
|
||||
export_csv: bool = False
|
||||
|
||||
|
||||
class CacheConfig(BaseModel):
|
||||
"""Configuration for caching."""
|
||||
|
||||
enabled: bool = True
|
||||
cache_dir: Path = Field(default=Path(".quality_cache"))
|
||||
max_age_days: int = Field(default=7, ge=1)
|
||||
use_memory_cache: bool = True
|
||||
|
||||
|
||||
class IntegrationConfig(BaseModel):
|
||||
"""Configuration for external integrations."""
|
||||
|
||||
# Git integration
|
||||
use_git: bool = True
|
||||
analyze_git_history: bool = False
|
||||
blame_duplicates: bool = False
|
||||
|
||||
# JSCPD integration for multi-language support
|
||||
use_jscpd: bool = True
|
||||
jscpd_path: str | None = None
|
||||
jscpd_config: dict[str, str | int | float | bool] = Field(default_factory=dict)
|
||||
|
||||
|
||||
class ExceptionRule(BaseModel):
|
||||
"""Configuration for a single exception rule."""
|
||||
|
||||
analysis_type: str # "complexity", "duplicates", "modernization", "code_smells"
|
||||
issue_type: str | None = None # Specific issue type (optional)
|
||||
file_patterns: list[str] = Field(default_factory=list) # File path patterns
|
||||
line_patterns: list[str] = Field(default_factory=list) # Line content patterns
|
||||
reason: str | None = None # Optional reason for the exception
|
||||
expires: str | None = None # Optional expiration date (YYYY-MM-DD)
|
||||
enabled: bool = True
|
||||
|
||||
|
||||
class ExceptionsConfig(BaseModel):
|
||||
"""Configuration for analysis exceptions."""
|
||||
|
||||
enabled: bool = True
|
||||
rules: list[ExceptionRule] = Field(default_factory=list)
|
||||
|
||||
# Global file/directory exceptions
|
||||
exclude_files: list[str] = Field(default_factory=list)
|
||||
exclude_directories: list[str] = Field(default_factory=list)
|
||||
|
||||
# Temporary suppressions (auto-expire)
|
||||
temporary_suppressions: dict[str, str] = Field(
|
||||
default_factory=dict
|
||||
) # rule_id -> expiry_date
|
||||
|
||||
|
||||
class QualityConfig(BaseModel):
|
||||
"""Main configuration for code quality analysis."""
|
||||
|
||||
# Core configuration sections
|
||||
detection: DetectionConfig = Field(default_factory=DetectionConfig)
|
||||
complexity: ComplexityConfig = Field(default_factory=ComplexityConfig)
|
||||
languages: LanguageConfig = Field(default_factory=LanguageConfig)
|
||||
paths: PathConfig = Field(default_factory=PathConfig)
|
||||
refactoring: RefactoringConfig = Field(default_factory=RefactoringConfig)
|
||||
reporting: ReportingConfig = Field(default_factory=ReportingConfig)
|
||||
cache: CacheConfig = Field(default_factory=CacheConfig)
|
||||
integrations: IntegrationConfig = Field(default_factory=IntegrationConfig)
|
||||
exceptions: ExceptionsConfig = Field(default_factory=ExceptionsConfig)
|
||||
|
||||
# Global settings
|
||||
version: str = "1.0.0"
|
||||
debug: bool = False
|
||||
verbose: bool = False
|
||||
|
||||
@field_validator("detection")
|
||||
def validate_similarity_weights(cls, v):
|
||||
"""Ensure similarity algorithm weights sum to approximately 1.0."""
|
||||
total_weight = sum(alg.weight for alg in v.similarity_algorithms if alg.enabled)
|
||||
if abs(total_weight - 1.0) > 0.1:
|
||||
# Auto-normalize weights
|
||||
for alg in v.similarity_algorithms:
|
||||
if alg.enabled:
|
||||
alg.weight = alg.weight / total_weight
|
||||
return v
|
||||
|
||||
class Config:
|
||||
"""Pydantic configuration."""
|
||||
|
||||
validate_assignment = True
|
||||
extra = "forbid"
|
||||
|
||||
|
||||
def load_config(config_path: Path | None = None) -> QualityConfig:
|
||||
"""Load configuration from file or use defaults."""
|
||||
if config_path is None:
|
||||
# Look for config files in common locations
|
||||
possible_paths = [
|
||||
Path("quality.yaml"),
|
||||
Path("quality.yml"),
|
||||
Path(".quality.yaml"),
|
||||
Path(".quality.yml"),
|
||||
Path("pyproject.toml"), # Look for [tool.quality] section
|
||||
]
|
||||
|
||||
for path in possible_paths:
|
||||
if path.exists():
|
||||
config_path = path
|
||||
break
|
||||
|
||||
if config_path and config_path.exists():
|
||||
return _load_from_file(config_path)
|
||||
else:
|
||||
return QualityConfig()
|
||||
|
||||
|
||||
def _load_from_file(config_path: Path) -> QualityConfig:
|
||||
"""Load configuration from specific file."""
|
||||
if config_path.suffix.lower() in [".yaml", ".yml"]:
|
||||
return _load_from_yaml(config_path)
|
||||
elif config_path.name == "pyproject.toml":
|
||||
return _load_from_pyproject(config_path)
|
||||
else:
|
||||
raise ValueError(f"Unsupported config file format: {config_path}")
|
||||
|
||||
|
||||
def _load_from_yaml(config_path: Path) -> QualityConfig:
|
||||
"""Load configuration from YAML file."""
|
||||
with open(config_path, encoding="utf-8") as f:
|
||||
data = yaml.safe_load(f)
|
||||
|
||||
return QualityConfig(**data) if data else QualityConfig()
|
||||
|
||||
|
||||
def _load_from_pyproject(config_path: Path) -> QualityConfig:
|
||||
"""Load configuration from pyproject.toml file."""
|
||||
try:
|
||||
import tomli
|
||||
except ImportError:
|
||||
try:
|
||||
import tomllib as tomli
|
||||
except ImportError as e:
|
||||
raise ImportError(
|
||||
"tomli package required to read pyproject.toml. "
|
||||
"Install with: pip install tomli"
|
||||
) from e
|
||||
|
||||
with open(config_path, "rb") as f:
|
||||
data = tomli.load(f)
|
||||
|
||||
# Extract quality configuration
|
||||
quality_config = data.get("tool", {}).get("quality", {})
|
||||
|
||||
return QualityConfig(**quality_config) if quality_config else QualityConfig()
|
||||
|
||||
|
||||
def save_config(config: QualityConfig, output_path: Path) -> None:
|
||||
"""Save configuration to YAML file."""
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
yaml.dump(
|
||||
config.dict(exclude_defaults=True),
|
||||
f,
|
||||
default_flow_style=False,
|
||||
sort_keys=True,
|
||||
)
|
||||
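Note (not part of the diff): a minimal sketch of the weight auto-normalization performed by validate_similarity_weights above. The import path assumes src/ is on sys.path:

from quality.config.schemas import (
    DetectionConfig,
    QualityConfig,
    SimilarityAlgorithmConfig,
)

cfg = QualityConfig(
    detection=DetectionConfig(
        similarity_algorithms=[
            SimilarityAlgorithmConfig(name="levenshtein", weight=0.5),
            SimilarityAlgorithmConfig(name="jaccard", weight=1.0),
        ]
    )
)
# The enabled weights summed to 1.5, so the validator rescales them to sum to 1.0:
print([round(a.weight, 3) for a in cfg.detection.similarity_algorithms])  # [0.333, 0.667]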
23
src/quality/core/__init__.py
Normal file
@@ -0,0 +1,23 @@
"""Core components for code quality analysis."""

from .ast_analyzer import ASTAnalyzer
from .base import (
    AnalysisResult,
    CodeBlock,
    ComplexityMetrics,
    DuplicateMatch,
    RefactoringSuggestion,
    SimilarityAlgorithm,
)
from .cache import CacheManager

__all__ = [
    "AnalysisResult",
    "ASTAnalyzer",
    "CacheManager",
    "CodeBlock",
    "ComplexityMetrics",
    "DuplicateMatch",
    "RefactoringSuggestion",
    "SimilarityAlgorithm",
]
281
src/quality/core/ast_analyzer.py
Normal file
@@ -0,0 +1,281 @@
"""Enhanced AST analysis for code quality detection."""

import ast
from pathlib import Path

from .base import CodeBlock, ComplexityMetrics


class ASTAnalyzer(ast.NodeVisitor):
    """Enhanced AST visitor for extracting code structure and complexity metrics."""

    def __init__(self, file_path: str = "", content: str = ""):
        self.file_path = file_path
        self.content_lines = content.splitlines() if content else []
        self.functions: list[CodeBlock] = []
        self.classes: list[CodeBlock] = []
        self.code_blocks: list[CodeBlock] = []
        self.imports: list[str] = []
        self.global_variables: set[str] = set()
        self.call_graph: dict[str, list[str]] = {}

    def extract_code_blocks(
        self, file_path: str | Path, min_lines: int = 5
    ) -> list[CodeBlock]:
        """Extract code blocks from a file."""
        try:
            with open(file_path, encoding="utf-8") as f:
                content = f.read()
        except (OSError, UnicodeDecodeError):
            return []

        # Reset analyzer state for the new file
        self.__init__(str(file_path), content)

        try:
            tree = ast.parse(content)
        except SyntaxError:
            return []
        else:
            self.visit(tree)

        # Filter blocks by minimum size
        return [
            block
            for block in self.code_blocks
            if (block.end_line - block.start_line + 1) >= min_lines
        ]

    def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
        """Visit function definitions with complexity analysis."""
        complexity = self._calculate_cyclomatic_complexity(node)
        cognitive_complexity = self._calculate_cognitive_complexity(node)

        metrics = ComplexityMetrics(
            cyclomatic_complexity=complexity, cognitive_complexity=cognitive_complexity
        )

        block = self._extract_code_block(node, node.name, "function", metrics)
        self.functions.append(block)
        self._extract_function_calls(node, node.name)
        self.generic_visit(node)

    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef) -> None:
        """Visit async function definitions."""
        complexity = self._calculate_cyclomatic_complexity(node)
        cognitive_complexity = self._calculate_cognitive_complexity(node)

        metrics = ComplexityMetrics(
            cyclomatic_complexity=complexity, cognitive_complexity=cognitive_complexity
        )

        block = self._extract_code_block(node, node.name, "function", metrics)
        self.functions.append(block)
        self._extract_function_calls(node, node.name)
        self.generic_visit(node)

    def visit_ClassDef(self, node: ast.ClassDef) -> None:
        """Visit class definitions."""
        # Class complexity is the sum of its method complexities
        methods = [
            n
            for n in ast.walk(node)
            if isinstance(n, ast.FunctionDef | ast.AsyncFunctionDef)
        ]
        total_complexity = sum(
            self._calculate_cyclomatic_complexity(method) for method in methods
        )

        metrics = ComplexityMetrics(cyclomatic_complexity=total_complexity)
        block = self._extract_code_block(node, node.name, "class", metrics)
        self.classes.append(block)
        self.generic_visit(node)

    def visit_Import(self, node: ast.Import) -> None:
        """Track imports."""
        for alias in node.names:
            self.imports.append(alias.name)
        self.generic_visit(node)

    def visit_ImportFrom(self, node: ast.ImportFrom) -> None:
        """Track from-imports."""
        if node.module:
            for alias in node.names:
                self.imports.append(f"{node.module}.{alias.name}")
        self.generic_visit(node)

    def visit_Assign(self, node: ast.Assign) -> None:
        """Track global variable assignments."""
        for target in node.targets:
            if isinstance(target, ast.Name):
                self.global_variables.add(target.id)
        self.generic_visit(node)

    def _extract_code_block(
        self,
        node: ast.AST,
        name: str,
        block_type: str,
        complexity_metrics: ComplexityMetrics | None = None,
    ) -> CodeBlock:
        """Extract code block from AST node with enhanced metadata."""
        start_line = node.lineno
        end_line = getattr(node, "end_lineno", start_line)

        if end_line is None:
            end_line = start_line

        content = "\n".join(self.content_lines[start_line - 1 : end_line])

        block = CodeBlock(
            file_path=self.file_path,
            start_line=start_line,
            end_line=end_line,
            content=content,
            complexity_metrics=complexity_metrics,
            function_name=name if block_type == "function" else None,
            class_name=name if block_type == "class" else None,
        )

        self.code_blocks.append(block)
        return block

    def _calculate_cyclomatic_complexity(self, node: ast.AST) -> int:
        """Calculate McCabe cyclomatic complexity."""
        complexity = 1  # Base complexity

        for child in ast.walk(node):
            if isinstance(
                child,
                ast.If
                | ast.While
                | ast.For
                | ast.AsyncFor
                | ast.ExceptHandler
                | ast.With
                | ast.AsyncWith
                | ast.Assert,
            ):
                complexity += 1
            elif isinstance(child, ast.BoolOp):
                complexity += len(child.values) - 1
            elif isinstance(child, ast.Break | ast.Continue):
                complexity += 1

        return complexity

    def _calculate_cognitive_complexity(self, node: ast.AST) -> int:
        """Calculate cognitive complexity (more human-oriented)."""

        def visit_node(n: ast.AST, level: int) -> int:
            local_complexity = 0

            if isinstance(
                n, ast.If | ast.While | ast.For | ast.AsyncFor | ast.ExceptHandler
            ):
                local_complexity += 1 + level
            elif isinstance(n, ast.Break | ast.Continue):
                local_complexity += 1
            elif isinstance(n, ast.BoolOp):
                local_complexity += len(n.values) - 1

            # Increase nesting for certain constructs
            if isinstance(
                n,
                ast.If
                | ast.While
                | ast.For
                | ast.AsyncFor
                | ast.With
                | ast.AsyncWith
                | ast.Try,
            ):
                level += 1

            for child in ast.iter_child_nodes(n):
                local_complexity += visit_node(child, level)

            return local_complexity

        return visit_node(node, 0)

    def _extract_function_calls(self, node: ast.AST, function_name: str) -> None:
        """Extract function calls to build the call graph."""
        calls = []

        for child in ast.walk(node):
            if isinstance(child, ast.Call):
                if isinstance(child.func, ast.Name):
                    calls.append(child.func.id)
                elif isinstance(child.func, ast.Attribute):
                    calls.append(child.func.attr)

        self.call_graph[function_name] = calls

    def get_code_structure_signature(self, node: ast.AST) -> str:
        """Generate a structure signature for semantic comparison."""
        structure_elements = []

        for child in ast.walk(node):
            if isinstance(child, ast.FunctionDef):
                structure_elements.append(f"func:{len(child.args.args)}")
            elif isinstance(child, ast.ClassDef):
                structure_elements.append(f"class:{len(child.bases)}")
            elif isinstance(child, ast.If):
                structure_elements.append("if")
            elif isinstance(child, ast.For):
                structure_elements.append("for")
            elif isinstance(child, ast.While):
                structure_elements.append("while")
            elif isinstance(child, ast.Try):
                structure_elements.append("try")

        return "|".join(structure_elements)

    def get_variable_usage_pattern(self, node: ast.AST) -> dict[str, int]:
        """Analyze variable usage patterns."""
        variable_usage: dict[str, int] = {}

        for child in ast.walk(node):
            if isinstance(child, ast.Name):
                name = child.id
                variable_usage[name] = variable_usage.get(name, 0) + 1

        return variable_usage

    def detect_code_smells(self) -> list[str]:
        """Detect common code smells."""
        smells = []

        # Long methods
        long_methods = [f for f in self.functions if f.lines_count > 30]
        if long_methods:
            smells.append(
                f"Long methods detected: {len(long_methods)} methods > 30 lines"
            )

        # Complex methods
        complex_methods = [
            f
            for f in self.functions
            if f.complexity_metrics and f.complexity_metrics.cyclomatic_complexity > 10
        ]
        if complex_methods:
            smells.append(
                f"Complex methods detected: {len(complex_methods)} methods with complexity > 10"
            )

        # Many parameters
        for func in self.functions:
            try:
                tree = ast.parse(func.content)
            except (SyntaxError, ValueError):
                # Method bodies keep their original indentation and may not
                # re-parse in isolation; skip them.
                continue
            for node in ast.walk(tree):
                if isinstance(node, ast.FunctionDef) and len(node.args.args) > 5:
                    smells.append(
                        f"Method with many parameters: {func.function_name} "
                        f"({len(node.args.args)} parameters)"
                    )

        return smells
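Note (not part of the diff): a minimal usage sketch for ASTAnalyzer; the target path is illustrative and the import assumes src/ is on sys.path:

from quality.core.ast_analyzer import ASTAnalyzer

analyzer = ASTAnalyzer()
blocks = analyzer.extract_code_blocks("src/quality/core/cache.py", min_lines=5)
for block in blocks:
    if block.complexity_metrics:
        print(block.function_name, block.complexity_metrics.cyclomatic_complexity)
print(analyzer.detect_code_smells())  # e.g. ['Long methods detected: ...']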
268
src/quality/core/base.py
Normal file
@@ -0,0 +1,268 @@
"""Base classes and interfaces for code quality analysis."""

import hashlib
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Protocol


class SimilarityAlgorithm(Protocol):
    """Protocol for similarity calculation algorithms."""

    def calculate(self, text1: str, text2: str) -> float:
        """Calculate similarity between two text strings.

        Args:
            text1: First text string
            text2: Second text string

        Returns:
            Similarity score between 0.0 and 1.0
        """
        ...


class RefactoringType(Enum):
    """Types of refactoring suggestions."""

    EXTRACT_METHOD = "extract_method"
    EXTRACT_CLASS = "extract_class"
    INTRODUCE_PARAMETER_OBJECT = "introduce_parameter_object"
    TEMPLATE_METHOD = "template_method"
    CONSOLIDATE_CONDITIONAL = "consolidate_conditional"
    REMOVE_DUPLICATE = "remove_duplicate"


@dataclass
class ComplexityMetrics:
    """Code complexity metrics."""

    cyclomatic_complexity: int
    cognitive_complexity: int | None = None
    halstead_difficulty: float | None = None
    halstead_effort: float | None = None
    maintainability_index: float | None = None

    @property
    def complexity_score(self) -> float:
        """Calculate overall complexity score."""
        score = self.cyclomatic_complexity
        if self.cognitive_complexity:
            score += self.cognitive_complexity * 0.5
        if self.halstead_difficulty:
            score += self.halstead_difficulty * 0.3
        return score


@dataclass
class CodeBlock:
    """Represents a block of code with metadata."""

    file_path: str
    start_line: int
    end_line: int
    content: str
    content_hash: str = field(init=False)
    normalized_content: str = field(init=False)
    complexity_metrics: ComplexityMetrics | None = None
    function_name: str | None = None
    class_name: str | None = None

    def __post_init__(self) -> None:
        """Initialize computed fields."""
        # md5 is used here as a fast, non-cryptographic fingerprint
        self.content_hash = hashlib.md5(self.content.encode()).hexdigest()
        self.normalized_content = self._normalize_content()

    def _normalize_content(self) -> str:
        """Normalize content for comparison with enhanced identifier abstraction."""
        content = self.content

        # Remove comments (Python, JavaScript, TypeScript)
        content = re.sub(r"#.*$", "", content, flags=re.MULTILINE)  # Python comments
        content = re.sub(r"//.*$", "", content, flags=re.MULTILINE)  # JS/TS single-line
        content = re.sub(r"/\*.*?\*/", "", content, flags=re.DOTALL)  # JS/TS multi-line

        # Remove string literals but preserve their structure
        content = re.sub(r'""".*?"""', '"""STRING"""', content, flags=re.DOTALL)
        content = re.sub(r"'''.*?'''", "'''STRING'''", content, flags=re.DOTALL)
        content = re.sub(r'"[^"]*"', '"STRING"', content)
        content = re.sub(r"'[^']*'", "'STRING'", content)

        # Normalize numeric literals
        content = re.sub(r"\b\d+\.?\d*\b", "NUM", content)

        # Abstract variable names while preserving keywords and operators
        python_keywords = {
            "def", "class", "if", "else", "elif", "for", "while", "try", "except",
            "finally", "with", "as", "import", "from", "return", "yield", "pass",
            "break", "continue", "and", "or", "not", "in", "is", "lambda", "None",
            "True", "False", "self", "cls",
        }

        # Split into tokens and normalize identifiers
        tokens = re.findall(r"\b\w+\b|[^\w\s]", content)
        normalized_tokens = []

        for token in tokens:
            if token.lower() in python_keywords or not re.match(r"^[a-zA-Z_]\w*$", token):
                # Keep keywords and non-identifiers as-is
                normalized_tokens.append(token)
            else:
                # Abstract user-defined identifiers
                normalized_tokens.append("VAR")

        content = " ".join(normalized_tokens)

        # Collapse extra whitespace
        content = re.sub(r"\s+", " ", content)
        return content.strip()

    @property
    def lines_count(self) -> int:
        """Get number of lines in the code block."""
        return self.end_line - self.start_line + 1

    @property
    def relative_path(self) -> str:
        """Get relative path from the current working directory."""
        try:
            return str(Path(self.file_path).relative_to(Path.cwd()))
        except ValueError:
            return self.file_path


@dataclass
class RefactoringSuggestion:
    """Suggestion for refactoring duplicated code."""

    refactoring_type: RefactoringType
    description: str
    affected_blocks: list[CodeBlock]
    effort_estimate: float  # Hours
    risk_score: float  # 0-1, higher = riskier
    expected_benefit: str
    implementation_steps: list[str] = field(default_factory=list)

    @property
    def priority_score(self) -> float:
        """Calculate priority based on benefit vs. effort and risk."""
        lines_saved = sum(block.lines_count for block in self.affected_blocks) - 1
        complexity_reduction = sum(
            block.complexity_metrics.complexity_score
            for block in self.affected_blocks
            if block.complexity_metrics
        )

        benefit_score = (lines_saved * 0.1) + (complexity_reduction * 0.5)
        cost_score = self.effort_estimate + (self.risk_score * 2)

        return benefit_score / max(cost_score, 0.1)


@dataclass
class DuplicateMatch:
    """Represents a duplicate code match."""

    blocks: list[CodeBlock]
    similarity_score: float
    match_type: str  # 'exact', 'similar', 'structural', 'semantic'
    description: str
    complexity_score: float = 0.0
    priority_score: float = 0.0
    refactoring_suggestion: RefactoringSuggestion | None = None

    def __post_init__(self) -> None:
        """Calculate derived scores."""
        if self.blocks:
            # Average complexity score
            complexity_scores = [
                block.complexity_metrics.complexity_score
                for block in self.blocks
                if block.complexity_metrics
            ]
            self.complexity_score = (
                sum(complexity_scores) / len(complexity_scores)
                if complexity_scores
                else 0.0
            )

            # Calculate priority: similarity × complexity × lines
            total_lines = sum(block.lines_count for block in self.blocks)
            self.priority_score = (
                self.similarity_score * self.complexity_score * (total_lines / 10)
            )


@dataclass
class AnalysisResult:
    """Result of code quality analysis."""

    duplicate_matches: list[DuplicateMatch]
    total_files_analyzed: int
    total_lines_analyzed: int
    total_duplicated_lines: int
    analysis_duration: float  # seconds
    summary_stats: dict[str, int | float] = field(default_factory=dict)
    refactoring_suggestions: list[RefactoringSuggestion] = field(default_factory=list)

    @property
    def duplication_percentage(self) -> float:
        """Calculate percentage of duplicated lines."""
        if self.total_lines_analyzed == 0:
            return 0.0
        return (self.total_duplicated_lines / self.total_lines_analyzed) * 100

    @property
    def high_priority_matches(self) -> list[DuplicateMatch]:
        """Get matches with high priority scores."""
        return [match for match in self.duplicate_matches if match.priority_score > 5.0]


class CodeAnalyzer(ABC):
    """Abstract base class for code analyzers."""

    @abstractmethod
    def analyze(self, code: str, file_path: str) -> AnalysisResult:
        """Analyze code and return results."""
        ...

    @abstractmethod
    def supports_language(self, language: str) -> bool:
        """Check if analyzer supports the given language."""
        ...


class QualityMetricsCalculator(ABC):
    """Abstract base class for quality metrics calculation."""

    @abstractmethod
    def calculate_complexity(self, code: str) -> ComplexityMetrics:
        """Calculate complexity metrics for code."""
        ...

    @abstractmethod
    def calculate_maintainability_index(self, code: str) -> float:
        """Calculate maintainability index."""
        ...


@dataclass
class AnalysisConfig:
    """Configuration for analysis."""

    min_lines: int = 5
    min_tokens: int = 50
    similarity_threshold: float = 0.8
    complexity_threshold: int = 10
    languages: set[str] = field(default_factory=lambda: {"python"})
    exclude_patterns: list[str] = field(default_factory=list)
    include_patterns: list[str] = field(default_factory=lambda: ["**/*.py"])
    enable_semantic_analysis: bool = True
    enable_refactoring_suggestions: bool = True
    max_files: int | None = None
    parallel_processing: bool = True
    cache_enabled: bool = True
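Note (not part of the diff): a minimal sketch of the identifier abstraction that CodeBlock._normalize_content performs, which is what makes renamed-but-identical code compare as equal:

from quality.core.base import CodeBlock

block = CodeBlock(
    file_path="demo.py",
    start_line=1,
    end_line=2,
    content="def add(a, b):\n    return a + b",
)
print(block.lines_count)         # 2
print(block.normalized_content)  # roughly: def VAR ( VAR , VAR ) : return VAR + VAR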
131
src/quality/core/cache.py
Normal file
@@ -0,0 +1,131 @@
"""Caching system for performance optimization."""

import hashlib
import pickle
from pathlib import Path
from typing import Any, Generic, TypeVar

from .base import CodeBlock

T = TypeVar("T")


class CacheManager(Generic[T]):
    """Generic cache manager for storing analysis results."""

    def __init__(self, cache_dir: Path = Path(".quality_cache")):
        self.cache_dir = cache_dir
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.memory_cache: dict[str, T] = {}

    def _get_cache_key(self, data: str, prefix: str = "") -> str:
        """Generate cache key from data."""
        hash_obj = hashlib.sha256(data.encode())
        return f"{prefix}_{hash_obj.hexdigest()[:16]}"

    def get(self, key: str, use_memory: bool = True) -> T | None:
        """Get item from cache."""
        # Check memory cache first
        if use_memory and key in self.memory_cache:
            return self.memory_cache[key]

        # Check disk cache
        cache_file = self.cache_dir / f"{key}.pickle"
        if cache_file.exists():
            try:
                with open(cache_file, "rb") as f:
                    data = pickle.load(f)
                if use_memory:
                    self.memory_cache[key] = data
                return data
            except Exception:
                # If the cache entry is corrupted, remove it
                cache_file.unlink(missing_ok=True)

        return None

    def set(self, key: str, value: T, use_memory: bool = True) -> None:
        """Store item in cache."""
        if use_memory:
            self.memory_cache[key] = value

        # Store on disk
        cache_file = self.cache_dir / f"{key}.pickle"
        try:
            with open(cache_file, "wb") as f:
                pickle.dump(value, f)
        except Exception:
            pass  # Fail silently if the disk cache is unavailable

    def get_file_hash(self, file_path: Path) -> str:
        """Get hash of file contents and modification time."""
        try:
            stat = file_path.stat()
            content_hash = hashlib.md5(file_path.read_bytes()).hexdigest()
        except Exception:
            return ""
        else:
            return f"{content_hash}_{stat.st_mtime}"

    def is_file_cached(self, file_path: Path) -> bool:
        """Check if file analysis is cached and up-to-date."""
        file_hash = self.get_file_hash(file_path)
        if not file_hash:
            return False

        cache_key = self._get_cache_key(str(file_path), "file")
        cached_result = self.get(f"{cache_key}_meta")

        return cached_result == file_hash

    def cache_file_analysis(self, file_path: Path, blocks: list[CodeBlock]) -> None:
        """Cache file analysis results."""
        file_hash = self.get_file_hash(file_path)
        cache_key = self._get_cache_key(str(file_path), "file")

        # Cache the blocks
        self.set(cache_key, blocks)
        # Cache the file metadata
        self.set(f"{cache_key}_meta", file_hash)

    def get_cached_file_analysis(self, file_path: Path) -> list[CodeBlock] | None:
        """Get cached file analysis if up-to-date."""
        if not self.is_file_cached(file_path):
            return None

        cache_key = self._get_cache_key(str(file_path), "file")
        return self.get(cache_key)

    def clear(self) -> None:
        """Clear all caches."""
        self.memory_cache.clear()

        # Clear disk cache
        for cache_file in self.cache_dir.glob("*.pickle"):
            cache_file.unlink(missing_ok=True)

    def clear_old_entries(self, max_age_days: int = 7) -> None:
        """Clear cache entries older than the specified number of days."""
        import time

        max_age_seconds = max_age_days * 24 * 3600
        current_time = time.time()

        for cache_file in self.cache_dir.glob("*.pickle"):
            try:
                if (current_time - cache_file.stat().st_mtime) > max_age_seconds:
                    cache_file.unlink()
            except Exception:
                pass

    def get_cache_stats(self) -> dict[str, Any]:
        """Get cache statistics."""
        disk_files = list(self.cache_dir.glob("*.pickle"))
        total_size = sum(f.stat().st_size for f in disk_files if f.exists())

        return {
            "memory_items": len(self.memory_cache),
            "disk_files": len(disk_files),
            "total_size_mb": total_size / (1024 * 1024),
            "cache_dir": str(self.cache_dir),
        }
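Note (not part of the diff): a minimal round-trip sketch for CacheManager. It calls the private _get_cache_key helper only for illustration, since that is the key scheme the class itself uses:

from pathlib import Path

from quality.core.cache import CacheManager

cache: CacheManager[str] = CacheManager(Path(".quality_cache"))
key = cache._get_cache_key("some analysis input", prefix="demo")
cache.set(key, "analysis result")
print(cache.get(key))           # "analysis result" (memory hit; disk fallback otherwise)
print(cache.get_cache_stats())  # item counts, total size in MB, cache dir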
354
src/quality/core/exceptions.py
Normal file
@@ -0,0 +1,354 @@
"""Exception handling system for quality analysis."""

import fnmatch
import re
from collections.abc import Callable
from datetime import datetime
from pathlib import Path
from typing import Any

from ..config.schemas import (
    ExceptionRule,
    QualityConfig,
)


class ExceptionFilter:
    """Filters analysis results based on configured exception rules."""

    def __init__(self, config: QualityConfig):
        self.config = config
        self.exceptions_config = config.exceptions
        self.active_rules = self._get_active_rules()

    def _get_active_rules(self) -> list[ExceptionRule]:
        """Get currently active exception rules."""
        if not self.exceptions_config.enabled:
            return []

        active_rules = []
        current_date = datetime.now().date()

        for rule in self.exceptions_config.rules:
            if not rule.enabled:
                continue

            # Check if the rule has expired
            if rule.expires:
                try:
                    expire_date = datetime.strptime(rule.expires, "%Y-%m-%d").date()
                    if current_date > expire_date:
                        continue
                except ValueError:
                    # Invalid date format, skip rule
                    continue

            active_rules.append(rule)

        return active_rules

    def should_suppress_issue(
        self,
        analysis_type: str,
        issue_type: str | None,
        file_path: str,
        line_number: int,
        line_content: str = "",
    ) -> tuple[bool, str | None]:
        """Check if an issue should be suppressed.

        Returns:
            (should_suppress, reason)
        """
        # Check global file/directory exclusions first
        if self._is_globally_excluded(file_path):
            return True, "File/directory globally excluded"

        # Check exception rules
        for rule in self.active_rules:
            if self._rule_matches(
                rule, analysis_type, issue_type, file_path, line_number, line_content
            ):
                return (
                    True,
                    rule.reason or f"Matched exception rule: {rule.analysis_type}",
                )

        return False, None

    def _is_globally_excluded(self, file_path: str) -> bool:
        """Check if file is globally excluded."""
        normalized_path = str(Path(file_path).resolve())

        # Check excluded files
        for pattern in self.exceptions_config.exclude_files:
            if fnmatch.fnmatch(normalized_path, pattern) or fnmatch.fnmatch(
                file_path, pattern
            ):
                return True

        # Check excluded directories
        for pattern in self.exceptions_config.exclude_directories:
            if fnmatch.fnmatch(str(Path(file_path).parent), pattern):
                return True
            # Also check if any parent directory matches
            path_parts = Path(file_path).parts
            for i in range(len(path_parts)):
                partial_path = "/".join(path_parts[: i + 1])
                if fnmatch.fnmatch(partial_path, pattern):
                    return True

        return False

    def _rule_matches(
        self,
        rule: ExceptionRule,
        analysis_type: str,
        issue_type: str | None,
        file_path: str,
        line_number: int,
        line_content: str,
    ) -> bool:
        """Check if a rule matches the current issue."""
        # Check analysis type ("*" matches all)
        if rule.analysis_type not in (analysis_type, "*"):
            return False

        # Check issue type if specified
        if rule.issue_type and rule.issue_type != issue_type:
            return False

        # Check file patterns
        if rule.file_patterns:
            file_matches = False
            for pattern in rule.file_patterns:
                if fnmatch.fnmatch(file_path, pattern) or fnmatch.fnmatch(
                    str(Path(file_path).name), pattern
                ):
                    file_matches = True
                    break
            if not file_matches:
                return False

        # Check line patterns
        if rule.line_patterns and line_content:
            line_matches = False
            for pattern in rule.line_patterns:
                if re.search(pattern, line_content):
                    line_matches = True
                    break
            if not line_matches:
                return False

        return True

    def filter_issues(
        self,
        analysis_type: str,
        issues: list[Any],
        get_file_path_fn: Callable[[Any], str] | None = None,
        get_line_number_fn: Callable[[Any], int] | None = None,
        get_line_content_fn: Callable[[Any], str] | None = None,
        get_issue_type_fn: Callable[[Any], str | None] | None = None,
    ) -> list[Any]:
        """Filter a list of issues based on exception rules.

        Args:
            analysis_type: Type of analysis ("complexity", "duplicates", etc.)
            issues: List of issues to filter
            get_file_path_fn: Function to extract file path from issue
            get_line_number_fn: Function to extract line number from issue
            get_line_content_fn: Function to extract line content from issue
            get_issue_type_fn: Function to extract issue type from issue
        """
        if not self.exceptions_config.enabled or not issues:
            return issues

        filtered_issues = []

        for issue in issues:
            # Extract issue details
            file_path = (
                get_file_path_fn(issue)
                if get_file_path_fn
                else getattr(issue, "file_path", "")
            )
            line_number = (
                get_line_number_fn(issue)
                if get_line_number_fn
                else getattr(issue, "line_number", 0)
            )
            line_content = (
                get_line_content_fn(issue)
                if get_line_content_fn
                else getattr(issue, "line_content", "")
            )
            issue_type = (
                get_issue_type_fn(issue)
                if get_issue_type_fn
                else getattr(issue, "issue_type", None)
            )

            should_suppress, reason = self.should_suppress_issue(
                analysis_type, issue_type, file_path, line_number, line_content
            )

            if not should_suppress:
                filtered_issues.append(issue)
            elif self.config.debug:
                print(
                    f"Suppressed {analysis_type} issue in {file_path}:{line_number} - {reason}"
                )

        return filtered_issues

    def get_suppression_summary(self) -> dict[str, Any]:
        """Get summary of active suppressions."""
        return {
            "enabled": self.exceptions_config.enabled,
            "active_rules": len(self.active_rules),
            "global_exclusions": {
                "files": len(self.exceptions_config.exclude_files),
                "directories": len(self.exceptions_config.exclude_directories),
            },
            "rules_by_type": self._summarize_rules_by_type(),
        }

    def _summarize_rules_by_type(self) -> dict[str, int]:
        """Summarize rules by analysis type."""
        summary: dict[str, int] = {}
        for rule in self.active_rules:
            analysis_type = rule.analysis_type
            summary[analysis_type] = summary.get(analysis_type, 0) + 1
        return summary


def create_example_exceptions_config() -> dict[str, Any]:
    """Create an example exceptions configuration."""
    return {
        "exceptions": {
            "enabled": True,
            "exclude_files": [
                "*/tests/*",
                "*/test_*",
                "*/__pycache__/*",
                "*/migrations/*",
            ],
            "exclude_directories": [
                "*/venv/*",
                "*/.venv/*",
                "*/node_modules/*",
                "*/.git/*",
            ],
            "rules": [
                {
                    "analysis_type": "complexity",
                    "issue_type": "high_complexity",
                    "file_patterns": ["*/legacy/*", "*/third_party/*"],
                    "reason": "Legacy code with known complexity - migration planned",
                },
                {
                    "analysis_type": "modernization",
                    "issue_type": "pydantic_v1_pattern",
                    "file_patterns": ["*/compatibility/*"],
                    "line_patterns": ["# pydantic v1 required", "# TODO: migrate"],
                    "reason": "Intentional Pydantic v1 usage for compatibility",
                },
                {
                    "analysis_type": "modernization",
                    "issue_type": "legacy_typing_import",
                    "file_patterns": ["*/external/*"],
                    "reason": "External library compatibility requirements",
                },
                {
                    "analysis_type": "duplicates",
                    "file_patterns": ["*/templates/*", "*/generated/*"],
                    "reason": "Generated or template code - duplication expected",
                },
                {
                    "analysis_type": "code_smells",
                    "issue_type": "long_method",
                    "file_patterns": ["*/parsers/*"],
                    "reason": "Parser methods intentionally long for readability",
                    "expires": "2024-12-31",
                },
            ],
        }
    }


def create_exceptions_config_template() -> str:
    """Create a YAML template for exceptions configuration."""
    return """# Quality Analysis Exceptions Configuration
# This file allows you to suppress specific analysis results

exceptions:
  enabled: true

  # Global file and directory exclusions
  exclude_files:
    - "*/tests/*"
    - "*/test_*"
    - "*/__pycache__/*"
    - "*/migrations/*"

  exclude_directories:
    - "*/venv/*"
    - "*/.venv/*"
    - "*/node_modules/*"
    - "*/.git/*"

  # Specific exception rules
  rules:
    # Example: Suppress complexity issues in legacy code
    - analysis_type: "complexity"
      issue_type: "high_complexity"
      file_patterns:
        - "*/legacy/*"
        - "*/third_party/*"
      reason: "Legacy code with known complexity - migration planned"

    # Example: Allow intentional Pydantic v1 usage
    - analysis_type: "modernization"
      issue_type: "pydantic_v1_pattern"
      file_patterns:
        - "*/compatibility/*"
      line_patterns:
        - "# pydantic v1 required"
        - "# TODO: migrate"
      reason: "Intentional Pydantic v1 usage for compatibility"

    # Example: Suppress typing imports for external compatibility
    - analysis_type: "modernization"
      issue_type: "legacy_typing_import"
      file_patterns:
        - "*/external/*"
      reason: "External library compatibility requirements"

    # Example: Allow duplicates in generated/template code
    - analysis_type: "duplicates"
      file_patterns:
        - "*/templates/*"
        - "*/generated/*"
      reason: "Generated or template code - duplication expected"

    # Example: Temporary suppression with expiration
    - analysis_type: "code_smells"
      issue_type: "long_method"
      file_patterns:
        - "*/parsers/*"
      reason: "Parser methods intentionally long for readability"
      expires: "2024-12-31"
      enabled: true

# Analysis Types:
# - "complexity" - Code complexity issues
# - "duplicates" - Duplicate code detection
# - "modernization" - Modern Python pattern suggestions
# - "code_smells" - General code smell detection

# Issue Types vary by analysis - see CLI output for specific types
# Use "*" for analysis_type to match all analysis types
# Leave issue_type empty to match all issues of that analysis type
"""
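Note (not part of the diff): a minimal sketch of suppressing complexity findings in legacy code via ExceptionFilter; the file path is illustrative:

from quality.config.schemas import ExceptionRule, QualityConfig
from quality.core.exceptions import ExceptionFilter

config = QualityConfig()
config.exceptions.rules.append(
    ExceptionRule(
        analysis_type="complexity",
        file_patterns=["*/legacy/*"],
        reason="Legacy code - migration planned",
    )
)
filt = ExceptionFilter(config)  # rules are snapshotted at construction time
print(filt.should_suppress_issue("complexity", "high_complexity", "src/legacy/old.py", 42))
# (True, 'Legacy code - migration planned')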
9
src/quality/detection/__init__.py
Normal file
@@ -0,0 +1,9 @@
"""Code duplicate detection engine."""

from .engine import DuplicateDetectionEngine
from .matcher import DuplicateMatcher

__all__ = [
    "DuplicateDetectionEngine",
    "DuplicateMatcher",
]
420
src/quality/detection/engine.py
Normal file
@@ -0,0 +1,420 @@
|
||||
"""Enhanced duplicate detection engine with multiple algorithms."""
|
||||
|
||||
import ast
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from ..complexity.analyzer import ComplexityAnalyzer
|
||||
from ..config.schemas import QualityConfig
|
||||
from ..core.ast_analyzer import ASTAnalyzer
|
||||
from ..core.base import CodeBlock, DuplicateMatch
|
||||
from .matcher import DuplicateMatcher
|
||||
from ..similarity.base import SimilarityCalculator
|
||||
from ..similarity.lsh import LSHDuplicateDetector
|
||||
|
||||
|
||||
class DuplicateDetectionEngine:
|
||||
"""Advanced duplicate detection engine with configurable algorithms."""
|
||||
|
||||
def __init__(self, config: QualityConfig | None = None):
|
||||
self.config = config or QualityConfig()
|
||||
self.detection_config = self.config.detection
|
||||
|
||||
# Initialize exception filter
|
||||
from ..core.exceptions import ExceptionFilter
|
||||
|
||||
self.exception_filter = ExceptionFilter(self.config)
|
||||
|
||||
# Initialize components
|
||||
self.ast_analyzer = ASTAnalyzer()
|
||||
self.complexity_analyzer = ComplexityAnalyzer(
|
||||
self.config.complexity, self.config
|
||||
)
|
||||
self.similarity_calculator = self._create_similarity_calculator()
|
||||
self.matcher = DuplicateMatcher(
|
||||
self.similarity_calculator, self.detection_config
|
||||
)
|
||||
|
||||
# LSH for large-scale detection
|
||||
self.lsh_detector = None
|
||||
if self.detection_config.use_lsh:
|
||||
self.lsh_detector = LSHDuplicateDetector(
|
||||
threshold=self.detection_config.similarity_threshold,
|
||||
num_perm=128,
|
||||
bands=16,
|
||||
rows=8,
|
||||
)
|
||||
|
||||
def detect_duplicates_in_files(
|
||||
self, file_paths: list[Path], max_workers: int | None = None
|
||||
) -> list[DuplicateMatch]:
|
||||
"""Detect duplicates across multiple files."""
|
||||
# Extract code blocks from all files
|
||||
all_blocks = []
|
||||
|
||||
for file_path in file_paths:
|
||||
try:
|
||||
blocks = self.ast_analyzer.extract_code_blocks(file_path)
|
||||
# Filter blocks by minimum size
|
||||
filtered_blocks = [
|
||||
block
|
||||
for block in blocks
|
||||
if (block.end_line - block.start_line + 1)
|
||||
>= self.detection_config.min_lines
|
||||
and len(block.content.split()) >= self.detection_config.min_tokens
|
||||
]
|
||||
all_blocks.extend(filtered_blocks)
|
||||
except Exception:
|
||||
# Skip files that can't be parsed
|
||||
continue
|
||||
|
||||
return self.detect_duplicates_in_blocks(all_blocks)
|
||||
|
||||
def detect_duplicates_in_blocks(
|
||||
self, blocks: list[CodeBlock]
|
||||
) -> list[DuplicateMatch]:
|
||||
"""Detect duplicates in a list of code blocks."""
|
||||
if not blocks:
|
||||
return []
|
||||
|
||||
# Use LSH for large datasets
|
||||
if (
|
||||
self.detection_config.use_lsh
|
||||
and len(blocks) >= self.detection_config.lsh_threshold
|
||||
and self.lsh_detector
|
||||
):
|
||||
return self._detect_with_lsh(blocks)
|
||||
else:
|
||||
return self._detect_with_similarity(blocks)
|
||||
|
||||
def find_duplicates_of_block(
|
||||
self, target_block: CodeBlock, candidate_blocks: list[CodeBlock]
|
||||
) -> list[DuplicateMatch]:
|
||||
"""Find duplicates of a specific code block."""
|
||||
matches = []
|
||||
|
||||
for candidate in candidate_blocks:
|
||||
if candidate == target_block: # Skip self
|
||||
continue
|
||||
|
||||
similarity = self.similarity_calculator.calculate_similarity(
|
||||
target_block, candidate
|
||||
)
|
||||
|
||||
if similarity >= self.detection_config.similarity_threshold:
|
||||
# Calculate complexity metrics
|
||||
target_complexity = self.complexity_analyzer.analyze_code(
|
||||
target_block.content
|
||||
)
|
||||
candidate_complexity = self.complexity_analyzer.analyze_code(
|
||||
candidate.content
|
||||
)
|
||||
|
||||
match_type = "exact" if similarity >= 0.95 else "similar"
|
||||
match = DuplicateMatch(
|
||||
blocks=[target_block, candidate],
|
||||
similarity_score=similarity,
|
||||
match_type=match_type,
|
||||
description=f"{match_type.title()} duplicate detected (similarity: {similarity:.3f})",
|
||||
complexity_score=max(
|
||||
target_complexity.get_overall_score(),
|
||||
candidate_complexity.get_overall_score(),
|
||||
),
|
||||
priority_score=self._calculate_priority_score(
|
||||
similarity,
|
||||
target_complexity.get_overall_score(),
|
||||
len([target_block, candidate]),
|
||||
),
|
||||
)
|
||||
matches.append(match)
|
||||
|
||||
return matches
|
||||
|
||||
def get_detailed_analysis(self, duplicate_match: DuplicateMatch) -> dict[str, Any]:
|
||||
"""Get detailed analysis of a duplicate match."""
|
||||
if not duplicate_match.blocks:
|
||||
return {}
|
||||
|
||||
# Analyze each block
|
||||
block_analyses = []
|
||||
for block in duplicate_match.blocks:
|
||||
complexity = self.complexity_analyzer.analyze_code(block.content)
|
||||
summary = self.complexity_analyzer.get_complexity_summary(complexity)
|
||||
|
||||
block_analyses.append(
|
||||
{
|
||||
"file_path": str(block.file_path),
|
||||
"line_range": f"{block.start_line}-{block.end_line}",
|
||||
"lines_of_code": block.end_line - block.start_line + 1,
|
||||
"complexity": summary,
|
||||
"content_preview": self._get_content_preview(block.content),
|
||||
}
|
||||
)
|
||||
|
||||
# Calculate similarity breakdown
|
||||
similarity_breakdown = {}
|
||||
if len(duplicate_match.blocks) >= 2:
|
||||
similarity_breakdown = (
|
||||
self.similarity_calculator.calculate_detailed_similarity(
|
||||
duplicate_match.blocks[0], duplicate_match.blocks[1]
|
||||
)
|
||||
)
|
||||
|
||||
# Generate refactoring suggestions
|
||||
suggestions = self._generate_refactoring_suggestions(duplicate_match)
|
||||
|
||||
return {
|
||||
"match_info": {
|
||||
"similarity_score": duplicate_match.similarity_score,
|
||||
"match_type": duplicate_match.match_type,
|
||||
"priority_score": duplicate_match.priority_score,
|
||||
"complexity_score": duplicate_match.complexity_score,
|
||||
},
|
||||
"blocks": block_analyses,
|
||||
"similarity_breakdown": similarity_breakdown,
|
||||
"refactoring_suggestions": suggestions,
|
||||
"estimated_effort": self._estimate_refactoring_effort(duplicate_match),
|
||||
"risk_assessment": self._assess_refactoring_risk(duplicate_match),
|
||||
}
|
||||
|
||||
def _create_similarity_calculator(self) -> SimilarityCalculator:
|
||||
"""Create similarity calculator with configured algorithms."""
|
||||
from ..similarity import (
|
||||
CosineSimilarity,
|
||||
JaccardSimilarity,
|
||||
LevenshteinSimilarity,
|
||||
SemanticSimilarity,
|
||||
StructuralSimilarity,
|
||||
)
|
||||
|
||||
algorithms = []
|
||||
|
||||
for algo_config in self.detection_config.similarity_algorithms:
|
||||
if not algo_config.enabled:
|
||||
continue
|
||||
|
||||
if algo_config.name == "levenshtein":
|
||||
algorithms.append(LevenshteinSimilarity(algo_config))
|
||||
elif algo_config.name == "jaccard":
|
||||
algorithms.append(JaccardSimilarity(algo_config))
|
||||
elif algo_config.name == "cosine":
|
||||
algorithms.append(CosineSimilarity(algo_config))
|
||||
elif algo_config.name == "semantic":
|
||||
algorithms.append(SemanticSimilarity(algo_config))
|
||||
elif algo_config.name == "structural":
|
||||
algorithms.append(StructuralSimilarity(algo_config))
|
||||
|
||||
return SimilarityCalculator(algorithms)
|
||||
|
||||
def _detect_with_lsh(self, blocks: list[CodeBlock]) -> list[DuplicateMatch]:
|
||||
"""Detect duplicates using LSH for performance."""
|
||||
if not self.lsh_detector:
|
||||
return []
|
||||
|
||||
# Add all blocks to LSH index
|
||||
for block in blocks:
|
||||
self.lsh_detector.add_code_block(block)
|
||||
|
||||
# Find duplicate groups
|
||||
duplicate_groups = self.lsh_detector.find_all_duplicates()
|
||||
|
||||
# Convert to DuplicateMatch objects
|
||||
matches = []
|
||||
for group in duplicate_groups:
|
||||
if len(group) < 2:
|
||||
continue
|
||||
|
||||
# Calculate exact similarity for the group
|
||||
representative = group[0]
|
||||
similarities = []
|
||||
|
||||
            for other in group[1:]:
                similarity = self.similarity_calculator.calculate_similarity(
                    representative, other
                )
                similarities.append(similarity)

            avg_similarity = (
                sum(similarities) / len(similarities) if similarities else 0.0
            )

            # Calculate complexity metrics
            complexities = []
            for block in group:
                complexity = self.complexity_analyzer.analyze_code(block.content)
                complexities.append(complexity.get_overall_score())

            max_complexity = max(complexities) if complexities else 0.0

            match = DuplicateMatch(
                blocks=group,
                similarity_score=avg_similarity,
                match_type="lsh_cluster",
                description=f"LSH cluster with {len(group)} blocks (similarity: {avg_similarity:.3f})",
                complexity_score=max_complexity,
                priority_score=self._calculate_priority_score(
                    avg_similarity, max_complexity, len(group)
                ),
            )
            matches.append(match)

        return self._filter_duplicate_matches(matches)

    def _detect_with_similarity(self, blocks: list[CodeBlock]) -> list[DuplicateMatch]:
        """Detect duplicates using similarity algorithms."""
        matches = self.matcher.find_all_duplicates(blocks)
        return self._filter_duplicate_matches(matches)

    def _filter_duplicate_matches(
        self, matches: list[DuplicateMatch]
    ) -> list[DuplicateMatch]:
        """Filter duplicate matches based on exception rules."""
        if not self.exception_filter:
            return matches

        filtered_matches = []
        for match in matches:
            # Check if any block in the match should be suppressed
            should_suppress_match = False

            for block in match.blocks:
                should_suppress, _reason = self.exception_filter.should_suppress_issue(
                    "duplicates",
                    "duplicate_code",
                    block.file_path,
                    block.start_line,
                    block.content,
                )
                if should_suppress:
                    should_suppress_match = True
                    break

            if not should_suppress_match:
                filtered_matches.append(match)

        return filtered_matches

    def _calculate_priority_score(
        self, similarity: float, complexity: float, block_count: int
    ) -> float:
        """Calculate priority score for refactoring."""
        # Base score from similarity
        priority = similarity

        # Boost for high complexity
        if complexity > 50:
            priority += 0.2

        # Boost for more duplicates
        if block_count > 2:
            priority += 0.1 * (block_count - 2)

        return min(priority, 1.0)

    def _generate_refactoring_suggestions(
        self, duplicate_match: DuplicateMatch
    ) -> list[str]:
        """Generate refactoring suggestions for duplicate code."""
        suggestions = []

        if len(duplicate_match.blocks) < 2:
            return suggestions

        first_block = duplicate_match.blocks[0]

        # Analyze code structure
        try:
            tree = ast.parse(first_block.content)

            # Check if it's a function
            has_function = any(
                isinstance(node, ast.FunctionDef) for node in ast.walk(tree)
            )
            has_class = any(isinstance(node, ast.ClassDef) for node in ast.walk(tree))

            if has_function:
                suggestions.append(
                    "Extract common function into a shared utility module"
                )
                suggestions.append(
                    "Consider creating a base function with configurable parameters"
                )
            elif has_class:
                suggestions.append("Extract common class into a base class or mixin")
                suggestions.append("Consider using inheritance or composition patterns")
            else:
                suggestions.append("Extract duplicate code into a reusable function")
                suggestions.append(
                    "Consider creating a utility module for shared logic"
                )

            # Complexity-based suggestions
            if duplicate_match.complexity_score > 60:
                suggestions.append(
                    "High complexity detected - consider breaking down into smaller functions"
                )

            # Similarity-based suggestions
            if duplicate_match.similarity_score > 0.95:
                suggestions.append(
                    "Nearly identical code - prioritize for immediate refactoring"
                )
            elif duplicate_match.similarity_score > 0.8:
                suggestions.append("Similar code - consider parameterizing differences")

        except SyntaxError:
            suggestions.append("Extract duplicate code into a reusable component")

        return suggestions

    def _estimate_refactoring_effort(self, duplicate_match: DuplicateMatch) -> str:
        """Estimate effort required for refactoring."""
        if not duplicate_match.blocks:
            return "Unknown"

        total_lines = sum(
            block.end_line - block.start_line + 1 for block in duplicate_match.blocks
        )

        if total_lines < 20:
            return "Low (1-2 hours)"
        elif total_lines < 100:
            return "Medium (0.5-1 day)"
        elif total_lines < 500:
            return "High (1-3 days)"
        else:
            return "Very High (1+ weeks)"

    def _assess_refactoring_risk(self, duplicate_match: DuplicateMatch) -> str:
        """Assess risk level of refactoring."""
        risk_factors = []

        if duplicate_match.complexity_score > 70:
            risk_factors.append("High complexity")

        if len(duplicate_match.blocks) > 5:
            risk_factors.append("Many duplicates")

        if duplicate_match.similarity_score < 0.85:
            risk_factors.append("Moderate differences between duplicates")

        # Check if duplicates span multiple files
        unique_files = len({block.file_path for block in duplicate_match.blocks})
        if unique_files > 3:
            risk_factors.append("Cross-module dependencies")

        if not risk_factors:
            return "Low"
        elif len(risk_factors) <= 2:
            return "Medium"
        else:
            return "High"

    def _get_content_preview(self, content: str, max_lines: int = 5) -> str:
        """Get a preview of code content."""
        lines = content.split("\n")
        if len(lines) <= max_lines:
            return content

        preview_lines = lines[:max_lines]
        return "\n".join(preview_lines) + f"\n... ({len(lines) - max_lines} more lines)"
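The priority heuristic above is simple enough to trace by hand. A minimal standalone sketch (mirroring _calculate_priority_score, with made-up inputs) showing how similarity, complexity, and duplicate count combine:

def priority_score(similarity: float, complexity: float, block_count: int) -> float:
    # Base score is the similarity itself, boosted for complexity and count.
    priority = similarity
    if complexity > 50:
        priority += 0.2
    if block_count > 2:
        priority += 0.1 * (block_count - 2)
    return min(priority, 1.0)

# Four near-identical blocks (similarity 0.9) of high complexity (60):
# 0.9 + 0.2 + 0.1 * 2 = 1.3, capped at 1.0.
print(priority_score(0.9, 60, 4))  # 1.0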
296
src/quality/detection/matcher.py
Normal file
@@ -0,0 +1,296 @@
"""Duplicate matching algorithms and strategies."""
|
||||
|
||||
from collections import defaultdict
|
||||
from typing import Any
|
||||
|
||||
from ..config.schemas import DetectionConfig
|
||||
from ..core.base import CodeBlock, DuplicateMatch
|
||||
from ..similarity.base import SimilarityCalculator
|
||||
|
||||
|
||||
class DuplicateMatcher:
|
||||
"""Handles matching logic for finding duplicates."""
|
||||
|
||||
def __init__(
|
||||
self, similarity_calculator: SimilarityCalculator, config: DetectionConfig
|
||||
):
|
||||
self.similarity_calculator = similarity_calculator
|
||||
self.config = config
|
||||
|
||||
def find_all_duplicates(self, blocks: list[CodeBlock]) -> list[DuplicateMatch]:
|
||||
"""Find all duplicate matches in a list of code blocks."""
|
||||
if len(blocks) < 2:
|
||||
return []
|
||||
|
||||
matches = []
|
||||
processed_pairs = set()
|
||||
|
||||
for i, block1 in enumerate(blocks):
|
||||
for j, block2 in enumerate(blocks[i + 1 :], i + 1):
|
||||
pair = (i, j)
|
||||
if pair in processed_pairs:
|
||||
continue
|
||||
|
||||
similarity = self.similarity_calculator.calculate_similarity(
|
||||
block1, block2
|
||||
)
|
||||
|
||||
if similarity >= self.config.similarity_threshold:
|
||||
match_type = "exact" if similarity >= 0.95 else "similar"
|
||||
match = DuplicateMatch(
|
||||
blocks=[block1, block2],
|
||||
similarity_score=similarity,
|
||||
match_type=match_type,
|
||||
description=f"{match_type.title()} match between 2 blocks (similarity: {similarity:.3f})",
|
||||
complexity_score=0.0, # Will be calculated by engine
|
||||
priority_score=similarity,
|
||||
)
|
||||
matches.append(match)
|
||||
processed_pairs.add(pair)
|
||||
|
||||
return self._merge_overlapping_matches(matches)
|
||||
|
||||
def find_duplicates_of_block(
|
||||
self, target_block: CodeBlock, candidate_blocks: list[CodeBlock]
|
||||
) -> list[DuplicateMatch]:
|
||||
"""Find duplicates of a specific block."""
|
||||
matches = []
|
||||
|
||||
for candidate in candidate_blocks:
|
||||
if candidate == target_block:
|
||||
continue
|
||||
|
||||
similarity = self.similarity_calculator.calculate_similarity(
|
||||
target_block, candidate
|
||||
)
|
||||
|
||||
if similarity >= self.config.similarity_threshold:
|
||||
match_type = "exact" if similarity >= 0.95 else "similar"
|
||||
match = DuplicateMatch(
|
||||
blocks=[target_block, candidate],
|
||||
similarity_score=similarity,
|
||||
match_type=match_type,
|
||||
description=f"{match_type.title()} match with target block (similarity: {similarity:.3f})",
|
||||
complexity_score=0.0,
|
||||
priority_score=similarity,
|
||||
)
|
||||
matches.append(match)
|
||||
|
||||
return matches
|
||||
|
||||
def find_similar_blocks(
|
||||
self,
|
||||
target_block: CodeBlock,
|
||||
candidate_blocks: list[CodeBlock],
|
||||
threshold: float,
|
||||
) -> list[tuple[CodeBlock, float]]:
|
||||
"""Find blocks similar to target with custom threshold."""
|
||||
similar_blocks = []
|
||||
|
||||
for candidate in candidate_blocks:
|
||||
if candidate == target_block:
|
||||
continue
|
||||
|
||||
similarity = self.similarity_calculator.calculate_similarity(
|
||||
target_block, candidate
|
||||
)
|
||||
|
||||
if similarity >= threshold:
|
||||
similar_blocks.append((candidate, similarity))
|
||||
|
||||
# Sort by similarity descending
|
||||
similar_blocks.sort(key=lambda x: x[1], reverse=True)
|
||||
return similar_blocks
|
||||
|
||||
def group_similar_blocks(self, blocks: list[CodeBlock]) -> list[list[CodeBlock]]:
|
||||
"""Group blocks into clusters of similar code."""
|
||||
if len(blocks) < 2:
|
||||
return [[block] for block in blocks]
|
||||
|
||||
# Build similarity matrix
|
||||
similarity_matrix = {}
|
||||
for i, block1 in enumerate(blocks):
|
||||
for j, block2 in enumerate(blocks[i + 1 :], i + 1):
|
||||
similarity = self.similarity_calculator.calculate_similarity(
|
||||
block1, block2
|
||||
)
|
||||
similarity_matrix[(i, j)] = similarity
|
||||
|
||||
# Use Union-Find to group similar blocks
|
||||
parent = list(range(len(blocks)))
|
||||
|
||||
def find(x: int) -> int:
|
||||
if parent[x] != x:
|
||||
parent[x] = find(parent[x])
|
||||
return parent[x]
|
||||
|
||||
def union(x: int, y: int) -> None:
|
||||
px, py = find(x), find(y)
|
||||
if px != py:
|
||||
parent[px] = py
|
||||
|
||||
# Union blocks that are similar enough
|
||||
for (i, j), similarity in similarity_matrix.items():
|
||||
if similarity >= self.config.similarity_threshold:
|
||||
union(i, j)
|
||||
|
||||
# Group blocks by their root parent
|
||||
groups = defaultdict(list)
|
||||
for i, block in enumerate(blocks):
|
||||
root = find(i)
|
||||
groups[root].append(block)
|
||||
|
||||
return list(groups.values())
|
||||
|
||||
def calculate_match_confidence(self, match: DuplicateMatch) -> dict[str, Any]:
|
||||
"""Calculate confidence metrics for a duplicate match."""
|
||||
if len(match.blocks) < 2:
|
||||
return {"confidence": 0.0, "factors": []}
|
||||
|
||||
confidence_factors = []
|
||||
total_confidence = 0.0
|
||||
|
||||
# Similarity-based confidence
|
||||
similarity_confidence = match.similarity_score
|
||||
confidence_factors.append(
|
||||
{
|
||||
"factor": "similarity_score",
|
||||
"value": match.similarity_score,
|
||||
"weight": 0.4,
|
||||
"contribution": similarity_confidence * 0.4,
|
||||
}
|
||||
)
|
||||
total_confidence += similarity_confidence * 0.4
|
||||
|
||||
# Length-based confidence (longer matches are more reliable)
|
||||
avg_length = sum(len(block.content) for block in match.blocks) / len(
|
||||
match.blocks
|
||||
)
|
||||
length_confidence = min(avg_length / 1000, 1.0) # Normalize to [0,1]
|
||||
confidence_factors.append(
|
||||
{
|
||||
"factor": "code_length",
|
||||
"value": avg_length,
|
||||
"weight": 0.2,
|
||||
"contribution": length_confidence * 0.2,
|
||||
}
|
||||
)
|
||||
total_confidence += length_confidence * 0.2
|
||||
|
||||
# Token count confidence
|
||||
avg_tokens = sum(len(block.content.split()) for block in match.blocks) / len(
|
||||
match.blocks
|
||||
)
|
||||
token_confidence = min(avg_tokens / 100, 1.0) # Normalize to [0,1]
|
||||
confidence_factors.append(
|
||||
{
|
||||
"factor": "token_count",
|
||||
"value": avg_tokens,
|
||||
"weight": 0.2,
|
||||
"contribution": token_confidence * 0.2,
|
||||
}
|
||||
)
|
||||
total_confidence += token_confidence * 0.2
|
||||
|
||||
# Structural complexity confidence
|
||||
complexity_confidence = min(match.complexity_score / 100, 1.0)
|
||||
confidence_factors.append(
|
||||
{
|
||||
"factor": "complexity_score",
|
||||
"value": match.complexity_score,
|
||||
"weight": 0.2,
|
||||
"contribution": complexity_confidence * 0.2,
|
||||
}
|
||||
)
|
||||
total_confidence += complexity_confidence * 0.2
|
||||
|
||||
return {
|
||||
"confidence": round(total_confidence, 3),
|
||||
"level": self._get_confidence_level(total_confidence),
|
||||
"factors": confidence_factors,
|
||||
}
|
||||
|
||||
def _merge_overlapping_matches(
|
||||
self, matches: list[DuplicateMatch]
|
||||
) -> list[DuplicateMatch]:
|
||||
"""Merge matches that share code blocks."""
|
||||
if len(matches) <= 1:
|
||||
return matches
|
||||
|
||||
# Group matches by overlapping blocks
|
||||
block_to_matches = defaultdict(list)
|
||||
for i, match in enumerate(matches):
|
||||
for block in match.blocks:
|
||||
block_to_matches[id(block)].append(i)
|
||||
|
||||
# Find groups of overlapping matches
|
||||
processed = set()
|
||||
merged_matches = []
|
||||
|
||||
for i, match in enumerate(matches):
|
||||
if i in processed:
|
||||
continue
|
||||
|
||||
# Find all matches that overlap with this one
|
||||
overlapping = {i}
|
||||
to_check = [i]
|
||||
|
||||
while to_check:
|
||||
current = to_check.pop()
|
||||
processed.add(current)
|
||||
|
||||
for block in matches[current].blocks:
|
||||
for match_idx in block_to_matches[id(block)]:
|
||||
if match_idx not in overlapping:
|
||||
overlapping.add(match_idx)
|
||||
to_check.append(match_idx)
|
||||
|
||||
if len(overlapping) == 1:
|
||||
# No overlaps, keep original match
|
||||
merged_matches.append(match)
|
||||
else:
|
||||
# Merge overlapping matches
|
||||
all_blocks = []
|
||||
similarities = []
|
||||
complexity_scores = []
|
||||
|
||||
for idx in overlapping:
|
||||
all_blocks.extend(matches[idx].blocks)
|
||||
similarities.append(matches[idx].similarity_score)
|
||||
complexity_scores.append(matches[idx].complexity_score)
|
||||
|
||||
# Remove duplicate blocks
|
||||
unique_blocks = []
|
||||
seen_blocks = set()
|
||||
for block in all_blocks:
|
||||
block_id = (block.file_path, block.start_line, block.end_line)
|
||||
if block_id not in seen_blocks:
|
||||
unique_blocks.append(block)
|
||||
seen_blocks.add(block_id)
|
||||
|
||||
# Create merged match
|
||||
avg_score = sum(similarities) / len(similarities)
|
||||
merged_match = DuplicateMatch(
|
||||
blocks=unique_blocks,
|
||||
similarity_score=avg_score,
|
||||
match_type="merged_cluster",
|
||||
description=f"Merged cluster with {len(unique_blocks)} blocks (avg similarity: {avg_score:.3f})",
|
||||
complexity_score=max(complexity_scores)
|
||||
if complexity_scores
|
||||
else 0.0,
|
||||
priority_score=avg_score,
|
||||
)
|
||||
merged_matches.append(merged_match)
|
||||
|
||||
return merged_matches
|
||||
|
||||
def _get_confidence_level(self, confidence: float) -> str:
|
||||
"""Get human-readable confidence level."""
|
||||
if confidence >= 0.8:
|
||||
return "High"
|
||||
elif confidence >= 0.6:
|
||||
return "Medium"
|
||||
elif confidence >= 0.4:
|
||||
return "Low"
|
||||
else:
|
||||
return "Very Low"
|
||||
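A minimal, self-contained sketch of the Union-Find clustering that group_similar_blocks applies, using plain strings and a toy similarity function instead of CodeBlock instances (all names here are illustrative):

from collections import defaultdict

def group_similar(items: list[str], similar, threshold: float) -> list[list[str]]:
    parent = list(range(len(items)))

    def find(x: int) -> int:
        if parent[x] != x:
            parent[x] = find(parent[x])  # path compression
        return parent[x]

    for i in range(len(items)):
        for j in range(i + 1, len(items)):
            if similar(items[i], items[j]) >= threshold:
                parent[find(i)] = find(j)  # union the two clusters

    groups = defaultdict(list)
    for i, item in enumerate(items):
        groups[find(i)].append(item)
    return list(groups.values())

# Toy similarity: Jaccard over character sets.
jac = lambda a, b: len(set(a) & set(b)) / len(set(a) | set(b))
print(group_similar(["abcd", "abce", "xyz"], jac, 0.5))
# [['abcd', 'abce'], ['xyz']]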
63
src/quality/similarity/__init__.py
Normal file
@@ -0,0 +1,63 @@
"""Similarity algorithms for code analysis."""
|
||||
|
||||
from .base import (
|
||||
BaseSimilarityAlgorithm,
|
||||
SimilarityCalculator,
|
||||
)
|
||||
from .lsh import (
|
||||
BandingLSH,
|
||||
LSHDuplicateDetector,
|
||||
LSHSimilarity,
|
||||
)
|
||||
from .semantic import (
|
||||
FunctionalSimilarity,
|
||||
HashSimilarity,
|
||||
SemanticSimilarity,
|
||||
)
|
||||
from .structural import (
|
||||
DependencySimilarity,
|
||||
IdentifierSimilarity,
|
||||
StructuralSimilarity,
|
||||
TreeEditDistance,
|
||||
)
|
||||
from .text_based import (
|
||||
DifflibSimilarity,
|
||||
LevenshteinSimilarity,
|
||||
LongestCommonSubsequence,
|
||||
NGramSimilarity,
|
||||
)
|
||||
from .token_based import (
|
||||
CosineSimilarity,
|
||||
JaccardSimilarity,
|
||||
ShingleSimilarity,
|
||||
TFIDFSimilarity,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
# Base classes
|
||||
"BaseSimilarityAlgorithm",
|
||||
"SimilarityCalculator",
|
||||
# Text-based algorithms
|
||||
"LevenshteinSimilarity",
|
||||
"DifflibSimilarity",
|
||||
"LongestCommonSubsequence",
|
||||
"NGramSimilarity",
|
||||
# Token-based algorithms
|
||||
"JaccardSimilarity",
|
||||
"CosineSimilarity",
|
||||
"TFIDFSimilarity",
|
||||
"ShingleSimilarity",
|
||||
# Structural algorithms
|
||||
"StructuralSimilarity",
|
||||
"TreeEditDistance",
|
||||
"DependencySimilarity",
|
||||
"IdentifierSimilarity",
|
||||
# Semantic algorithms
|
||||
"SemanticSimilarity",
|
||||
"FunctionalSimilarity",
|
||||
"HashSimilarity",
|
||||
# LSH algorithms
|
||||
"LSHSimilarity",
|
||||
"LSHDuplicateDetector",
|
||||
"BandingLSH",
|
||||
]
|
||||
130
src/quality/similarity/base.py
Normal file
@@ -0,0 +1,130 @@
"""Base similarity calculation framework."""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any
|
||||
|
||||
from ..config.schemas import SimilarityAlgorithmConfig
|
||||
from ..core.base import CodeBlock
|
||||
|
||||
|
||||
class BaseSimilarityAlgorithm(ABC):
|
||||
"""Base class for similarity algorithms."""
|
||||
|
||||
def __init__(self, config: SimilarityAlgorithmConfig | None = None):
|
||||
self.config = config or SimilarityAlgorithmConfig(
|
||||
name=self.__class__.__name__.lower()
|
||||
)
|
||||
|
||||
@abstractmethod
|
||||
def calculate(self, text1: str, text2: str) -> float:
|
||||
"""Calculate similarity between two text strings."""
|
||||
...
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
"""Get algorithm name."""
|
||||
return self.config.name
|
||||
|
||||
@property
|
||||
def weight(self) -> float:
|
||||
"""Get algorithm weight."""
|
||||
return self.config.weight
|
||||
|
||||
@property
|
||||
def enabled(self) -> bool:
|
||||
"""Check if algorithm is enabled."""
|
||||
return self.config.enabled
|
||||
|
||||
|
||||
class SimilarityCalculator:
|
||||
"""Main similarity calculator that combines multiple algorithms."""
|
||||
|
||||
def __init__(self, algorithms: list[BaseSimilarityAlgorithm] | None = None):
|
||||
self.algorithms = algorithms or []
|
||||
self._normalize_weights()
|
||||
|
||||
def add_algorithm(self, algorithm: BaseSimilarityAlgorithm) -> None:
|
||||
"""Add a similarity algorithm."""
|
||||
self.algorithms.append(algorithm)
|
||||
self._normalize_weights()
|
||||
|
||||
def calculate_similarity(self, block1: CodeBlock, block2: CodeBlock) -> float:
|
||||
"""Calculate weighted similarity between two code blocks."""
|
||||
if not self.algorithms:
|
||||
return 0.0
|
||||
|
||||
total_score = 0.0
|
||||
total_weight = 0.0
|
||||
|
||||
for algorithm in self.algorithms:
|
||||
if not algorithm.enabled:
|
||||
continue
|
||||
|
||||
try:
|
||||
score = algorithm.calculate(
|
||||
block1.normalized_content, block2.normalized_content
|
||||
)
|
||||
total_score += score * algorithm.weight
|
||||
total_weight += algorithm.weight
|
||||
except Exception:
|
||||
# Skip algorithm if it fails
|
||||
continue
|
||||
|
||||
return total_score / total_weight if total_weight > 0 else 0.0
|
||||
|
||||
def calculate_detailed_similarity(
|
||||
self, block1: CodeBlock, block2: CodeBlock
|
||||
) -> dict[str, float]:
|
||||
"""Calculate similarity with breakdown by algorithm."""
|
||||
results = {}
|
||||
|
||||
for algorithm in self.algorithms:
|
||||
if not algorithm.enabled:
|
||||
continue
|
||||
|
||||
try:
|
||||
score = algorithm.calculate(
|
||||
block1.normalized_content, block2.normalized_content
|
||||
)
|
||||
results[algorithm.name] = score
|
||||
except Exception:
|
||||
results[algorithm.name] = 0.0
|
||||
|
||||
# Calculate weighted average
|
||||
total_score = sum(
|
||||
results[alg.name] * alg.weight
|
||||
for alg in self.algorithms
|
||||
if alg.enabled and alg.name in results
|
||||
)
|
||||
total_weight = sum(alg.weight for alg in self.algorithms if alg.enabled)
|
||||
|
||||
results["weighted_average"] = (
|
||||
total_score / total_weight if total_weight > 0 else 0.0
|
||||
)
|
||||
|
||||
return results
|
||||
|
||||
def _normalize_weights(self) -> None:
|
||||
"""Normalize algorithm weights to sum to 1.0."""
|
||||
enabled_algorithms = [alg for alg in self.algorithms if alg.enabled]
|
||||
|
||||
if not enabled_algorithms:
|
||||
return
|
||||
|
||||
total_weight = sum(alg.weight for alg in enabled_algorithms)
|
||||
|
||||
if total_weight > 0:
|
||||
for algorithm in enabled_algorithms:
|
||||
algorithm.config.weight = algorithm.weight / total_weight
|
||||
|
||||
def get_algorithm_info(self) -> list[dict[str, Any]]:
|
||||
"""Get information about all algorithms."""
|
||||
return [
|
||||
{
|
||||
"name": alg.name,
|
||||
"weight": alg.weight,
|
||||
"enabled": alg.enabled,
|
||||
"class": alg.__class__.__name__,
|
||||
}
|
||||
for alg in self.algorithms
|
||||
]
|
||||
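A hypothetical usage sketch of the weighted combination, assuming the text-based algorithms from src/quality/similarity/text_based.py are importable alongside this module; "Block" is a stand-in for CodeBlock, since calculate_similarity only reads normalized_content:

from dataclasses import dataclass

@dataclass
class Block:
    normalized_content: str  # the only field the calculator touches

calc = SimilarityCalculator([DifflibSimilarity(), LevenshteinSimilarity()])
a = Block("def add(x, y): return x + y")
b = Block("def add(a, b): return a + b")
print(calc.calculate_similarity(a, b))           # weighted score in [0, 1]
print(calc.calculate_detailed_similarity(a, b))  # per-algorithm breakdown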
326
src/quality/similarity/lsh.py
Normal file
@@ -0,0 +1,326 @@
"""LSH-based similarity for efficient large-scale duplicate detection."""
|
||||
|
||||
import hashlib
|
||||
from collections import defaultdict
|
||||
from typing import Any
|
||||
|
||||
try:
|
||||
from datasketch import MinHash, MinHashLSH
|
||||
|
||||
LSH_AVAILABLE = True
|
||||
except ImportError:
|
||||
LSH_AVAILABLE = False
|
||||
|
||||
from ..config.schemas import SimilarityAlgorithmConfig
|
||||
from ..core.base import CodeBlock
|
||||
from .base import BaseSimilarityAlgorithm
|
||||
|
||||
|
||||
class LSHSimilarity(BaseSimilarityAlgorithm):
|
||||
"""LSH-based similarity for efficient approximate matching."""
|
||||
|
||||
def __init__(self, config: SimilarityAlgorithmConfig | None = None):
|
||||
if config is None:
|
||||
config = SimilarityAlgorithmConfig(
|
||||
name="lsh",
|
||||
weight=0.1,
|
||||
parameters={"threshold": 0.8, "num_perm": 128, "bands": 16, "rows": 8},
|
||||
)
|
||||
super().__init__(config)
|
||||
|
||||
# LSH parameters
|
||||
self.threshold = self.config.parameters.get("threshold", 0.8)
|
||||
self.num_perm = self.config.parameters.get("num_perm", 128)
|
||||
self.bands = self.config.parameters.get("bands", 16)
|
||||
self.rows = self.config.parameters.get("rows", 8)
|
||||
|
||||
# Initialize LSH index
|
||||
self.lsh_index = None
|
||||
self.minhashes = {}
|
||||
|
||||
if LSH_AVAILABLE:
|
||||
self._initialize_lsh()
|
||||
|
||||
def _initialize_lsh(self) -> None:
|
||||
"""Initialize LSH index."""
|
||||
if LSH_AVAILABLE:
|
||||
self.lsh_index = MinHashLSH(
|
||||
threshold=self.threshold, num_perm=self.num_perm
|
||||
)
|
||||
|
||||
def calculate(self, text1: str, text2: str) -> float:
|
||||
"""Calculate similarity using MinHash."""
|
||||
if not LSH_AVAILABLE:
|
||||
# Fallback to simple text similarity
|
||||
return self._fallback_similarity(text1, text2)
|
||||
|
||||
if not text1 and not text2:
|
||||
return 1.0
|
||||
if not text1 or not text2:
|
||||
return 0.0
|
||||
|
||||
minhash1 = self._create_minhash(text1)
|
||||
minhash2 = self._create_minhash(text2)
|
||||
|
||||
return minhash1.jaccard(minhash2)
|
||||
|
||||
def _create_minhash(self, text: str) -> Any:
|
||||
"""Create MinHash for text."""
|
||||
if not LSH_AVAILABLE:
|
||||
return None
|
||||
|
||||
minhash = MinHash(num_perm=self.num_perm)
|
||||
|
||||
# Create shingles from text
|
||||
shingles = self._get_shingles(text)
|
||||
|
||||
for shingle in shingles:
|
||||
minhash.update(shingle.encode("utf-8"))
|
||||
|
||||
return minhash
|
||||
|
||||
def _get_shingles(self, text: str, k: int = 4) -> set[str]:
|
||||
"""Generate character k-shingles from text."""
|
||||
# Normalize text
|
||||
normalized = text.lower().replace(" ", "").replace("\n", "").replace("\t", "")
|
||||
|
||||
if len(normalized) < k:
|
||||
return {normalized}
|
||||
|
||||
return {normalized[i : i + k] for i in range(len(normalized) - k + 1)}
|
||||
|
||||
def _fallback_similarity(self, text1: str, text2: str) -> float:
|
||||
"""Fallback similarity when LSH is not available."""
|
||||
if not text1 and not text2:
|
||||
return 1.0
|
||||
if not text1 or not text2:
|
||||
return 0.0
|
||||
|
||||
# Simple Jaccard similarity on character 4-grams
|
||||
shingles1 = self._get_shingles(text1)
|
||||
shingles2 = self._get_shingles(text2)
|
||||
|
||||
if not shingles1 and not shingles2:
|
||||
return 1.0
|
||||
if not shingles1 or not shingles2:
|
||||
return 0.0
|
||||
|
||||
intersection = len(shingles1.intersection(shingles2))
|
||||
union = len(shingles1.union(shingles2))
|
||||
|
||||
return intersection / union if union > 0 else 0.0
|
||||
|
||||
|
||||
class LSHDuplicateDetector:
|
||||
"""High-performance duplicate detection using LSH."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
threshold: float = 0.8,
|
||||
num_perm: int = 128,
|
||||
bands: int = 16,
|
||||
rows: int = 8,
|
||||
):
|
||||
self.threshold = threshold
|
||||
self.num_perm = num_perm
|
||||
self.bands = bands
|
||||
self.rows = rows
|
||||
|
||||
self.lsh_index = None
|
||||
self.minhashes = {}
|
||||
self.code_blocks = {}
|
||||
|
||||
if LSH_AVAILABLE:
|
||||
self.lsh_index = MinHashLSH(threshold=threshold, num_perm=num_perm)
|
||||
|
||||
def add_code_block(self, block: CodeBlock) -> None:
|
||||
"""Add a code block to the LSH index."""
|
||||
if not LSH_AVAILABLE:
|
||||
return
|
||||
|
||||
block_id = self._get_block_id(block)
|
||||
minhash = self._create_minhash(block.normalized_content)
|
||||
|
||||
self.minhashes[block_id] = minhash
|
||||
self.code_blocks[block_id] = block
|
||||
|
||||
if self.lsh_index:
|
||||
self.lsh_index.insert(block_id, minhash)
|
||||
|
||||
def find_similar_blocks(self, block: CodeBlock) -> list[tuple[CodeBlock, float]]:
|
||||
"""Find similar blocks using LSH."""
|
||||
if not LSH_AVAILABLE or not self.lsh_index:
|
||||
return []
|
||||
|
||||
block_id = self._get_block_id(block)
|
||||
query_minhash = self._create_minhash(block.normalized_content)
|
||||
|
||||
# Get candidate similar blocks
|
||||
candidates = self.lsh_index.query(query_minhash)
|
||||
|
||||
similar_blocks = []
|
||||
for candidate_id in candidates:
|
||||
if candidate_id == block_id:
|
||||
continue
|
||||
|
||||
candidate_block = self.code_blocks.get(candidate_id)
|
||||
if candidate_block:
|
||||
# Calculate exact similarity
|
||||
similarity = query_minhash.jaccard(self.minhashes[candidate_id])
|
||||
if similarity >= self.threshold:
|
||||
similar_blocks.append((candidate_block, similarity))
|
||||
|
||||
# Sort by similarity descending
|
||||
similar_blocks.sort(key=lambda x: x[1], reverse=True)
|
||||
return similar_blocks
|
||||
|
||||
def find_all_duplicates(self) -> list[list[CodeBlock]]:
|
||||
"""Find all duplicate groups using LSH."""
|
||||
if not LSH_AVAILABLE or not self.lsh_index:
|
||||
return []
|
||||
|
||||
duplicate_groups = []
|
||||
processed = set()
|
||||
|
||||
for block_id, block in self.code_blocks.items():
|
||||
if block_id in processed:
|
||||
continue
|
||||
|
||||
similar_blocks = self.find_similar_blocks(block)
|
||||
|
||||
if similar_blocks:
|
||||
# Create group with original block and similar blocks
|
||||
group = [block]
|
||||
group.extend([similar_block for similar_block, _ in similar_blocks])
|
||||
|
||||
# Mark all blocks in group as processed
|
||||
processed.add(block_id)
|
||||
for similar_block, _ in similar_blocks:
|
||||
similar_id = self._get_block_id(similar_block)
|
||||
processed.add(similar_id)
|
||||
|
||||
duplicate_groups.append(group)
|
||||
|
||||
return duplicate_groups
|
||||
|
||||
def get_statistics(self) -> dict[str, Any]:
|
||||
"""Get LSH index statistics."""
|
||||
if not LSH_AVAILABLE or not self.lsh_index:
|
||||
return {"error": "LSH not available"}
|
||||
|
||||
return {
|
||||
"total_blocks": len(self.code_blocks),
|
||||
"threshold": self.threshold,
|
||||
"num_perm": self.num_perm,
|
||||
"lsh_available": LSH_AVAILABLE,
|
||||
"index_keys": len(self.lsh_index.keys)
|
||||
if hasattr(self.lsh_index, "keys")
|
||||
else 0,
|
||||
}
|
||||
|
||||
def _create_minhash(self, text: str) -> Any:
|
||||
"""Create MinHash for text."""
|
||||
if not LSH_AVAILABLE:
|
||||
return None
|
||||
|
||||
minhash = MinHash(num_perm=self.num_perm)
|
||||
|
||||
# Create token-based shingles
|
||||
shingles = self._get_token_shingles(text)
|
||||
|
||||
for shingle in shingles:
|
||||
minhash.update(shingle.encode("utf-8"))
|
||||
|
||||
return minhash
|
||||
|
||||
def _get_token_shingles(self, text: str, k: int = 3) -> set[str]:
|
||||
"""Generate token k-shingles from text."""
|
||||
import re
|
||||
|
||||
# Tokenize text
|
||||
tokens = re.findall(r"\w+", text.lower())
|
||||
|
||||
if len(tokens) < k:
|
||||
return {" ".join(tokens)}
|
||||
|
||||
return {" ".join(tokens[i : i + k]) for i in range(len(tokens) - k + 1)}
|
||||
|
||||
def _get_block_id(self, block: CodeBlock) -> str:
|
||||
"""Generate unique ID for code block."""
|
||||
content = f"{block.file_path}:{block.start_line}:{block.end_line}"
|
||||
return hashlib.md5(content.encode()).hexdigest()
|
||||
|
||||
|
||||
class BandingLSH:
|
||||
"""Custom LSH implementation with banding technique."""
|
||||
|
||||
def __init__(self, bands: int = 20, rows: int = 5, threshold: float = 0.8):
|
||||
self.bands = bands
|
||||
self.rows = rows
|
||||
self.threshold = threshold
|
||||
self.hash_tables: list[defaultdict[int, set[str]]] = [
|
||||
defaultdict(set) for _ in range(bands)
|
||||
]
|
||||
self.signatures: dict[str, list[int]] = {}
|
||||
|
||||
def add_signature(self, item_id: str, signature: list[int]) -> None:
|
||||
"""Add signature to LSH buckets."""
|
||||
if len(signature) != self.bands * self.rows:
|
||||
raise ValueError(
|
||||
f"Signature length {len(signature)} != {self.bands * self.rows}"
|
||||
)
|
||||
|
||||
self.signatures[item_id] = signature
|
||||
|
||||
# Hash each band
|
||||
for band_idx in range(self.bands):
|
||||
start = band_idx * self.rows
|
||||
end = start + self.rows
|
||||
band_signature = tuple(signature[start:end])
|
||||
|
||||
# Hash the band
|
||||
band_hash = hash(band_signature)
|
||||
self.hash_tables[band_idx][band_hash].add(item_id)
|
||||
|
||||
def find_candidates(self, query_id: str) -> set[str]:
|
||||
"""Find candidate similar items."""
|
||||
if query_id not in self.signatures:
|
||||
return set()
|
||||
|
||||
candidates = set()
|
||||
query_signature = self.signatures[query_id]
|
||||
|
||||
# Check each band
|
||||
for band_idx in range(self.bands):
|
||||
start = band_idx * self.rows
|
||||
end = start + self.rows
|
||||
band_signature = tuple(query_signature[start:end])
|
||||
|
||||
band_hash = hash(band_signature)
|
||||
candidates.update(self.hash_tables[band_idx][band_hash])
|
||||
|
||||
# Remove query item itself
|
||||
candidates.discard(query_id)
|
||||
return candidates
|
||||
|
||||
def estimate_jaccard(self, sig1: list[int], sig2: list[int]) -> float:
|
||||
"""Estimate Jaccard similarity from signatures."""
|
||||
if len(sig1) != len(sig2):
|
||||
return 0.0
|
||||
|
||||
matches = sum(1 for a, b in zip(sig1, sig2, strict=False) if a == b)
|
||||
return matches / len(sig1)
|
||||
|
||||
def get_statistics(self) -> dict[str, Any]:
|
||||
"""Get LSH statistics."""
|
||||
total_buckets = sum(len(table) for table in self.hash_tables)
|
||||
avg_bucket_size = total_buckets / self.bands if self.bands > 0 else 0
|
||||
|
||||
return {
|
||||
"bands": self.bands,
|
||||
"rows": self.rows,
|
||||
"total_items": len(self.signatures),
|
||||
"total_buckets": total_buckets,
|
||||
"avg_bucket_size": avg_bucket_size,
|
||||
"threshold": self.threshold,
|
||||
}
|
||||
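The bands x rows split determines how aggressively banded LSH surfaces candidates. A short sketch of the standard banding probability, p(s) = 1 - (1 - s^r)^b, evaluated at this file's defaults; this is an illustration of the math, not part of the module:

# Probability that two items with Jaccard similarity s share at least one
# LSH band, for b bands of r rows each: p(s) = 1 - (1 - s**r)**b.
def candidate_probability(s: float, bands: int, rows: int) -> float:
    return 1.0 - (1.0 - s**rows) ** bands

for s in (0.5, 0.7, 0.8, 0.9):
    # With bands=16, rows=8 the curve rises steeply near
    # s ~ (1/b)**(1/r) ~ 0.707, approximating the 0.8 threshold.
    print(s, round(candidate_probability(s, bands=16, rows=8), 3))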
398
src/quality/similarity/semantic.py
Normal file
@@ -0,0 +1,398 @@
"""Semantic similarity algorithms for code analysis."""
|
||||
|
||||
import ast
|
||||
import hashlib
|
||||
import re
|
||||
from collections import Counter
|
||||
|
||||
from ..config.schemas import SimilarityAlgorithmConfig
|
||||
from .base import BaseSimilarityAlgorithm
|
||||
|
||||
|
||||
class SemanticSimilarity(BaseSimilarityAlgorithm):
|
||||
"""Semantic similarity algorithm based on normalized code patterns."""
|
||||
|
||||
def __init__(self, config: SimilarityAlgorithmConfig | None = None):
|
||||
if config is None:
|
||||
config = SimilarityAlgorithmConfig(name="semantic", weight=0.2)
|
||||
super().__init__(config)
|
||||
|
||||
def calculate(self, text1: str, text2: str) -> float:
|
||||
"""Calculate similarity based on semantic patterns."""
|
||||
if not text1 and not text2:
|
||||
return 1.0
|
||||
if not text1 or not text2:
|
||||
return 0.0
|
||||
|
||||
# Normalize both texts for semantic comparison
|
||||
normalized1 = self._normalize_code(text1)
|
||||
normalized2 = self._normalize_code(text2)
|
||||
|
||||
# Calculate multiple semantic similarities
|
||||
pattern_sim = self._pattern_similarity(normalized1, normalized2)
|
||||
concept_sim = self._concept_similarity(text1, text2)
|
||||
structure_sim = self._semantic_structure_similarity(text1, text2)
|
||||
|
||||
# Weighted combination
|
||||
return pattern_sim * 0.4 + concept_sim * 0.4 + structure_sim * 0.2
|
||||
|
||||
def _normalize_code(self, code: str) -> str:
|
||||
"""Normalize code for semantic comparison."""
|
||||
# Remove comments
|
||||
code = re.sub(r"#.*$", "", code, flags=re.MULTILINE)
|
||||
code = re.sub(r'""".*?"""', "", code, flags=re.DOTALL)
|
||||
code = re.sub(r"'''.*?'''", "", code, flags=re.DOTALL)
|
||||
|
||||
# Normalize whitespace
|
||||
code = re.sub(r"\s+", " ", code).strip()
|
||||
|
||||
# Normalize variable names to generic patterns
|
||||
code = re.sub(r"\b[a-z_][a-z0-9_]*\b", "VAR", code)
|
||||
|
||||
# Normalize string literals
|
||||
code = re.sub(r'"[^"]*"', "STR", code)
|
||||
code = re.sub(r"'[^']*'", "STR", code)
|
||||
|
||||
# Normalize numbers
|
||||
code = re.sub(r"\b\d+\.?\d*\b", "NUM", code)
|
||||
|
||||
return code
|
||||
|
||||
def _pattern_similarity(self, normalized1: str, normalized2: str) -> float:
|
||||
"""Compare normalized code patterns."""
|
||||
if not normalized1 and not normalized2:
|
||||
return 1.0
|
||||
if not normalized1 or not normalized2:
|
||||
return 0.0
|
||||
|
||||
import difflib
|
||||
|
||||
return difflib.SequenceMatcher(None, normalized1, normalized2).ratio()
|
||||
|
||||
def _concept_similarity(self, code1: str, code2: str) -> float:
|
||||
"""Compare conceptual similarity using keywords and operations."""
|
||||
concepts1 = self._extract_concepts(code1)
|
||||
concepts2 = self._extract_concepts(code2)
|
||||
|
||||
if not concepts1 and not concepts2:
|
||||
return 1.0
|
||||
if not concepts1 or not concepts2:
|
||||
return 0.0
|
||||
|
||||
# Calculate cosine similarity on concept frequencies
|
||||
all_concepts = set(concepts1.keys()) | set(concepts2.keys())
|
||||
|
||||
dot_product = sum(
|
||||
concepts1.get(concept, 0) * concepts2.get(concept, 0)
|
||||
for concept in all_concepts
|
||||
)
|
||||
magnitude1 = (
|
||||
sum(concepts1.get(concept, 0) ** 2 for concept in all_concepts) ** 0.5
|
||||
)
|
||||
magnitude2 = (
|
||||
sum(concepts2.get(concept, 0) ** 2 for concept in all_concepts) ** 0.5
|
||||
)
|
||||
|
||||
if magnitude1 == 0 or magnitude2 == 0:
|
||||
return 0.0
|
||||
|
||||
return dot_product / (magnitude1 * magnitude2)
|
||||
|
||||
def _extract_concepts(self, code: str) -> Counter[str]:
|
||||
"""Extract conceptual elements from code."""
|
||||
concepts = Counter()
|
||||
|
||||
# Python keywords and operations
|
||||
python_concepts = {
|
||||
"def",
|
||||
"class",
|
||||
"if",
|
||||
"else",
|
||||
"elif",
|
||||
"for",
|
||||
"while",
|
||||
"try",
|
||||
"except",
|
||||
"finally",
|
||||
"with",
|
||||
"return",
|
||||
"yield",
|
||||
"import",
|
||||
"from",
|
||||
"as",
|
||||
"and",
|
||||
"or",
|
||||
"not",
|
||||
"in",
|
||||
"is",
|
||||
"lambda",
|
||||
"pass",
|
||||
"break",
|
||||
"continue",
|
||||
}
|
||||
|
||||
# Extract words
|
||||
words = re.findall(r"\b\w+\b", code.lower())
|
||||
|
||||
for word in words:
|
||||
if word in python_concepts:
|
||||
concepts[f"keyword:{word}"] += 1
|
||||
elif word in ["len", "str", "int", "float", "list", "dict", "set", "tuple"]:
|
||||
concepts[f"builtin:{word}"] += 1
|
||||
elif word.endswith("error") or word.endswith("exception"):
|
||||
concepts["error_handling"] += 1
|
||||
elif word in ["print", "log", "debug", "info", "warn", "error"]:
|
||||
concepts["logging"] += 1
|
||||
elif word in ["open", "read", "write", "close", "file"]:
|
||||
concepts["file_io"] += 1
|
||||
elif word in ["get", "post", "put", "delete", "request", "response"]:
|
||||
concepts["http"] += 1
|
||||
elif word in ["query", "select", "insert", "update", "delete", "database"]:
|
||||
concepts["database"] += 1
|
||||
|
||||
# Extract operators and patterns
|
||||
operators = re.findall(r"[+\-*/=<>!&|^~%]", code)
|
||||
for op in operators:
|
||||
concepts[f"operator:{op}"] += 1
|
||||
|
||||
return concepts
|
||||
|
||||
def _semantic_structure_similarity(self, code1: str, code2: str) -> float:
|
||||
"""Compare semantic structure patterns."""
|
||||
try:
|
||||
import ast
|
||||
|
||||
tree1 = ast.parse(code1)
|
||||
tree2 = ast.parse(code2)
|
||||
|
||||
patterns1 = self._extract_semantic_patterns(tree1)
|
||||
patterns2 = self._extract_semantic_patterns(tree2)
|
||||
|
||||
return self._compare_pattern_sets(patterns1, patterns2)
|
||||
|
||||
except SyntaxError:
|
||||
return 0.0
|
||||
|
||||
def _extract_semantic_patterns(self, tree: ast.AST) -> set[str]:
|
||||
"""Extract semantic patterns from AST."""
|
||||
patterns = set()
|
||||
|
||||
for node in ast.walk(tree):
|
||||
if isinstance(node, ast.FunctionDef):
|
||||
# Function signature patterns
|
||||
arg_count = len(node.args.args)
|
||||
patterns.add(f"function_args:{arg_count}")
|
||||
|
||||
# Check for common patterns
|
||||
if any(isinstance(n, ast.Return) for n in ast.walk(node)):
|
||||
patterns.add("function_returns")
|
||||
if any(isinstance(n, ast.Yield) for n in ast.walk(node)):
|
||||
patterns.add("generator_function")
|
||||
|
||||
elif isinstance(node, ast.ClassDef):
|
||||
# Class patterns
|
||||
base_count = len(node.bases)
|
||||
patterns.add(f"class_inheritance:{base_count}")
|
||||
|
||||
elif isinstance(node, ast.Try):
|
||||
# Exception handling patterns
|
||||
patterns.add("exception_handling")
|
||||
if node.finalbody:
|
||||
patterns.add("finally_block")
|
||||
|
||||
elif isinstance(node, ast.With):
|
||||
# Context manager pattern
|
||||
patterns.add("context_manager")
|
||||
|
||||
elif isinstance(node, ast.ListComp):
|
||||
patterns.add("list_comprehension")
|
||||
elif isinstance(node, ast.DictComp):
|
||||
patterns.add("dict_comprehension")
|
||||
elif isinstance(node, ast.SetComp):
|
||||
patterns.add("set_comprehension")
|
||||
|
||||
elif isinstance(node, ast.Lambda):
|
||||
patterns.add("lambda_function")
|
||||
|
||||
elif isinstance(
|
||||
node, (ast.FunctionDef, ast.ClassDef, ast.AsyncFunctionDef)
|
||||
):
|
||||
if node.decorator_list:
|
||||
patterns.add("decorator_usage")
|
||||
|
||||
return patterns
|
||||
|
||||
def _compare_pattern_sets(self, patterns1: set[str], patterns2: set[str]) -> float:
|
||||
"""Compare two sets of semantic patterns."""
|
||||
if not patterns1 and not patterns2:
|
||||
return 1.0
|
||||
if not patterns1 or not patterns2:
|
||||
return 0.0
|
||||
|
||||
intersection = len(patterns1.intersection(patterns2))
|
||||
union = len(patterns1.union(patterns2))
|
||||
|
||||
return intersection / union if union > 0 else 0.0
|
||||
|
||||
|
||||
class FunctionalSimilarity(BaseSimilarityAlgorithm):
|
||||
"""Similarity based on functional behavior patterns."""
|
||||
|
||||
def __init__(self, config: SimilarityAlgorithmConfig | None = None):
|
||||
if config is None:
|
||||
config = SimilarityAlgorithmConfig(name="functional", weight=0.15)
|
||||
super().__init__(config)
|
||||
|
||||
def calculate(self, text1: str, text2: str) -> float:
|
||||
"""Calculate similarity based on functional patterns."""
|
||||
if not text1 and not text2:
|
||||
return 1.0
|
||||
if not text1 or not text2:
|
||||
return 0.0
|
||||
|
||||
try:
|
||||
import ast
|
||||
|
||||
tree1 = ast.parse(text1)
|
||||
tree2 = ast.parse(text2)
|
||||
|
||||
behavior1 = self._extract_behavioral_patterns(tree1)
|
||||
behavior2 = self._extract_behavioral_patterns(tree2)
|
||||
|
||||
return self._compare_behaviors(behavior1, behavior2)
|
||||
|
||||
except SyntaxError:
|
||||
return 0.0
|
||||
|
||||
def _extract_behavioral_patterns(self, tree: ast.AST) -> dict[str, int]:
|
||||
"""Extract behavioral patterns from AST."""
|
||||
|
||||
patterns = {
|
||||
"data_access": 0, # Reading/accessing data
|
||||
"data_mutation": 0, # Modifying data
|
||||
"control_flow": 0, # Conditional logic
|
||||
"iteration": 0, # Loops and iteration
|
||||
"function_calls": 0, # Function invocations
|
||||
"exception_handling": 0, # Error handling
|
||||
"io_operations": 0, # Input/output
|
||||
"mathematical": 0, # Math operations
|
||||
}
|
||||
|
||||
for node in ast.walk(tree):
|
||||
if isinstance(node, (ast.Subscript, ast.Attribute)):
|
||||
patterns["data_access"] += 1
|
||||
elif isinstance(node, (ast.Assign, ast.AugAssign)):
|
||||
patterns["data_mutation"] += 1
|
||||
elif isinstance(node, ast.If):
|
||||
patterns["control_flow"] += 1
|
||||
elif isinstance(node, (ast.For, ast.While)):
|
||||
patterns["iteration"] += 1
|
||||
elif isinstance(node, ast.Call):
|
||||
patterns["function_calls"] += 1
|
||||
# Check for specific types of calls
|
||||
if isinstance(node.func, ast.Name):
|
||||
func_name = node.func.id.lower()
|
||||
if func_name in ["print", "input", "open", "read", "write"]:
|
||||
patterns["io_operations"] += 1
|
||||
elif isinstance(node, ast.Try):
|
||||
patterns["exception_handling"] += 1
|
||||
elif isinstance(node, ast.BinOp) and isinstance(
|
||||
node.op, (ast.Add, ast.Sub, ast.Mult, ast.Div, ast.Mod, ast.Pow)
|
||||
):
|
||||
patterns["mathematical"] += 1
|
||||
|
||||
return patterns
|
||||
|
||||
def _compare_behaviors(
|
||||
self, behavior1: dict[str, int], behavior2: dict[str, int]
|
||||
) -> float:
|
||||
"""Compare behavioral patterns."""
|
||||
if not any(behavior1.values()) and not any(behavior2.values()):
|
||||
return 1.0
|
||||
if not any(behavior1.values()) or not any(behavior2.values()):
|
||||
return 0.0
|
||||
|
||||
# Calculate cosine similarity on behavior patterns
|
||||
all_patterns = set(behavior1.keys()) | set(behavior2.keys())
|
||||
|
||||
dot_product = sum(
|
||||
behavior1.get(pattern, 0) * behavior2.get(pattern, 0)
|
||||
for pattern in all_patterns
|
||||
)
|
||||
magnitude1 = (
|
||||
sum(behavior1.get(pattern, 0) ** 2 for pattern in all_patterns) ** 0.5
|
||||
)
|
||||
magnitude2 = (
|
||||
sum(behavior2.get(pattern, 0) ** 2 for pattern in all_patterns) ** 0.5
|
||||
)
|
||||
|
||||
if magnitude1 == 0 or magnitude2 == 0:
|
||||
return 0.0
|
||||
|
||||
return dot_product / (magnitude1 * magnitude2)
|
||||
|
||||
|
||||
class HashSimilarity(BaseSimilarityAlgorithm):
|
||||
"""Similarity based on code content hashing."""
|
||||
|
||||
def __init__(self, config: SimilarityAlgorithmConfig | None = None):
|
||||
if config is None:
|
||||
config = SimilarityAlgorithmConfig(name="hash", weight=0.1)
|
||||
super().__init__(config)
|
||||
|
||||
def calculate(self, text1: str, text2: str) -> float:
|
||||
"""Calculate similarity using various hash comparisons."""
|
||||
if not text1 and not text2:
|
||||
return 1.0
|
||||
if not text1 or not text2:
|
||||
return 0.0
|
||||
|
||||
# Multiple hash-based comparisons
|
||||
exact_match = self._exact_hash_similarity(text1, text2)
|
||||
if exact_match == 1.0:
|
||||
return 1.0
|
||||
|
||||
normalized_match = self._normalized_hash_similarity(text1, text2)
|
||||
fuzzy_match = self._fuzzy_hash_similarity(text1, text2)
|
||||
|
||||
return max(exact_match, normalized_match, fuzzy_match)
|
||||
|
||||
def _exact_hash_similarity(self, text1: str, text2: str) -> float:
|
||||
"""Check for exact content match."""
|
||||
hash1 = hashlib.md5(text1.encode()).hexdigest()
|
||||
hash2 = hashlib.md5(text2.encode()).hexdigest()
|
||||
return 1.0 if hash1 == hash2 else 0.0
|
||||
|
||||
def _normalized_hash_similarity(self, text1: str, text2: str) -> float:
|
||||
"""Check for normalized content match."""
|
||||
# Normalize whitespace and comments
|
||||
normalized1 = re.sub(
|
||||
r"\s+", " ", re.sub(r"#.*$", "", text1, flags=re.MULTILINE)
|
||||
).strip()
|
||||
normalized2 = re.sub(
|
||||
r"\s+", " ", re.sub(r"#.*$", "", text2, flags=re.MULTILINE)
|
||||
).strip()
|
||||
|
||||
hash1 = hashlib.md5(normalized1.encode()).hexdigest()
|
||||
hash2 = hashlib.md5(normalized2.encode()).hexdigest()
|
||||
|
||||
return 1.0 if hash1 == hash2 else 0.0
|
||||
|
||||
def _fuzzy_hash_similarity(self, text1: str, text2: str) -> float:
|
||||
"""Calculate fuzzy hash similarity using character n-grams."""
|
||||
# Create character 4-grams for fuzzy matching
|
||||
ngrams1 = set(text1[i : i + 4] for i in range(len(text1) - 3))
|
||||
ngrams2 = set(text2[i : i + 4] for i in range(len(text2) - 3))
|
||||
|
||||
if not ngrams1 and not ngrams2:
|
||||
return 1.0
|
||||
if not ngrams1 or not ngrams2:
|
||||
return 0.0
|
||||
|
||||
intersection = len(ngrams1.intersection(ngrams2))
|
||||
union = len(ngrams1.union(ngrams2))
|
||||
|
||||
jaccard = intersection / union if union > 0 else 0.0
|
||||
|
||||
# Return 1.0 only for very high similarity
|
||||
return 1.0 if jaccard > 0.95 else 0.0
|
||||
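_concept_similarity reduces to cosine similarity over keyword counts. A hand-checkable sketch with toy Counter values (illustrative inputs, not real extractor output):

from collections import Counter

def cosine(c1: Counter, c2: Counter) -> float:
    keys = set(c1) | set(c2)
    dot = sum(c1[k] * c2[k] for k in keys)
    m1 = sum(v * v for v in c1.values()) ** 0.5
    m2 = sum(v * v for v in c2.values()) ** 0.5
    return dot / (m1 * m2) if m1 and m2 else 0.0

# Two loops plus one conditional vs. one of each: high but imperfect overlap.
print(cosine(Counter({"keyword:for": 2, "keyword:if": 1}),
             Counter({"keyword:for": 1, "keyword:if": 1})))  # ~0.949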
399
src/quality/similarity/structural.py
Normal file
@@ -0,0 +1,399 @@
"""Structural similarity algorithms for code analysis."""
|
||||
|
||||
import ast
|
||||
from collections import Counter
|
||||
|
||||
from ..config.schemas import SimilarityAlgorithmConfig
|
||||
from .base import BaseSimilarityAlgorithm
|
||||
|
||||
|
||||
class StructuralSimilarity(BaseSimilarityAlgorithm):
|
||||
"""AST-based structural similarity algorithm."""
|
||||
|
||||
def __init__(self, config: SimilarityAlgorithmConfig | None = None):
|
||||
if config is None:
|
||||
config = SimilarityAlgorithmConfig(name="structural", weight=0.25)
|
||||
super().__init__(config)
|
||||
|
||||
def calculate(self, text1: str, text2: str) -> float:
|
||||
"""Calculate similarity based on AST structure."""
|
||||
if not text1 and not text2:
|
||||
return 1.0
|
||||
if not text1 or not text2:
|
||||
return 0.0
|
||||
|
||||
try:
|
||||
tree1 = ast.parse(text1)
|
||||
tree2 = ast.parse(text2)
|
||||
except SyntaxError:
|
||||
# Fallback to text-based comparison for malformed code
|
||||
return self._fallback_similarity(text1, text2)
|
||||
|
||||
structure1 = self._extract_structure(tree1)
|
||||
structure2 = self._extract_structure(tree2)
|
||||
|
||||
return self._compare_structures(structure1, structure2)
|
||||
|
||||
def _extract_structure(self, tree: ast.AST) -> list[str]:
|
||||
"""Extract enhanced structural patterns from AST."""
|
||||
structure = []
|
||||
|
||||
# Track nesting depth for better structural comparison
|
||||
def visit_node(node: ast.AST, depth: int = 0) -> None:
|
||||
depth_prefix = f"d{depth}:" if depth > 0 else ""
|
||||
|
||||
if isinstance(node, ast.FunctionDef):
|
||||
# Abstract function names but keep structural information
|
||||
arg_count = len(node.args.args)
|
||||
has_decorators = len(node.decorator_list) > 0
|
||||
structure.append(f"{depth_prefix}function:args{arg_count}:dec{has_decorators}")
|
||||
|
||||
# Analyze function body patterns
|
||||
body_patterns = []
|
||||
for child in node.body:
|
||||
if isinstance(child, ast.If):
|
||||
body_patterns.append("if")
|
||||
elif isinstance(child, ast.For):
|
||||
body_patterns.append("for")
|
||||
elif isinstance(child, ast.While):
|
||||
body_patterns.append("while")
|
||||
elif isinstance(child, ast.Try):
|
||||
body_patterns.append("try")
|
||||
elif isinstance(child, ast.Return):
|
||||
body_patterns.append("return")
|
||||
|
||||
if body_patterns:
|
||||
structure.append(f"{depth_prefix}body_pattern:{'_'.join(body_patterns[:5])}")
|
||||
|
||||
# Visit children with increased depth
|
||||
for child in ast.iter_child_nodes(node):
|
||||
visit_node(child, depth + 1)
|
||||
|
||||
elif isinstance(node, ast.AsyncFunctionDef):
|
||||
arg_count = len(node.args.args)
|
||||
has_decorators = len(node.decorator_list) > 0
|
||||
structure.append(f"{depth_prefix}async_function:args{arg_count}:dec{has_decorators}")
|
||||
|
||||
for child in ast.iter_child_nodes(node):
|
||||
visit_node(child, depth + 1)
|
||||
|
||||
elif isinstance(node, ast.ClassDef):
|
||||
# Abstract class names but keep inheritance and structure info
|
||||
base_count = len(node.bases)
|
||||
has_decorators = len(node.decorator_list) > 0
|
||||
structure.append(f"{depth_prefix}class:bases{base_count}:dec{has_decorators}")
|
||||
|
||||
# Count methods in class
|
||||
method_count = sum(1 for child in node.body if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef)))
|
||||
structure.append(f"{depth_prefix}class_methods:{method_count}")
|
||||
|
||||
for child in ast.iter_child_nodes(node):
|
||||
visit_node(child, depth + 1)
|
||||
|
||||
elif isinstance(node, ast.If):
|
||||
# Track conditional structure complexity
|
||||
elif_count = len([n for n in node.orelse if isinstance(n, ast.If)])
|
||||
has_else = any(not isinstance(n, ast.If) for n in node.orelse)
|
||||
structure.append(f"{depth_prefix}if:elif{elif_count}:else{has_else}")
|
||||
|
||||
for child in ast.iter_child_nodes(node):
|
||||
visit_node(child, depth + 1)
|
||||
|
||||
elif isinstance(node, ast.For):
|
||||
# Detect nested loops
|
||||
is_nested = any(isinstance(child, (ast.For, ast.While)) for child in ast.walk(node))
|
||||
structure.append(f"{depth_prefix}for:nested{is_nested}")
|
||||
|
||||
for child in ast.iter_child_nodes(node):
|
||||
visit_node(child, depth + 1)
|
||||
|
||||
elif isinstance(node, ast.While):
|
||||
is_nested = any(isinstance(child, (ast.For, ast.While)) for child in ast.walk(node))
|
||||
structure.append(f"{depth_prefix}while:nested{is_nested}")
|
||||
|
||||
for child in ast.iter_child_nodes(node):
|
||||
visit_node(child, depth + 1)
|
||||
|
||||
elif isinstance(node, ast.Try):
|
||||
except_count = len(node.handlers)
|
||||
has_finally = bool(node.finalbody)
|
||||
has_else = bool(node.orelse)
|
||||
structure.append(f"{depth_prefix}try:except{except_count}:finally{has_finally}:else{has_else}")
|
||||
|
||||
for child in ast.iter_child_nodes(node):
|
||||
visit_node(child, depth + 1)
|
||||
|
||||
elif isinstance(node, ast.With):
|
||||
item_count = len(node.items)
|
||||
structure.append(f"{depth_prefix}with:items{item_count}")
|
||||
|
||||
for child in ast.iter_child_nodes(node):
|
||||
visit_node(child, depth + 1)
|
||||
|
||||
elif isinstance(node, ast.Return):
|
||||
has_value = node.value is not None
|
||||
structure.append(f"{depth_prefix}return:value{has_value}")
|
||||
|
||||
elif isinstance(node, ast.Assign):
|
||||
target_count = len(node.targets)
|
||||
structure.append(f"{depth_prefix}assign:targets{target_count}")
|
||||
|
||||
elif isinstance(node, ast.Call):
|
||||
# Abstract function calls but keep argument structure
|
||||
arg_count = len(node.args)
|
||||
kwarg_count = len(node.keywords)
|
||||
structure.append(f"{depth_prefix}call:args{arg_count}:kwargs{kwarg_count}")
|
||||
|
||||
else:
|
||||
# Visit other node types without adding to structure
|
||||
for child in ast.iter_child_nodes(node):
|
||||
visit_node(child, depth)
|
||||
|
||||
visit_node(tree)
|
||||
return structure
|
||||
|
||||
def _compare_structures(
|
||||
self, structure1: list[str], structure2: list[str]
|
||||
) -> float:
|
||||
"""Compare two structural patterns."""
|
||||
if not structure1 and not structure2:
|
||||
return 1.0
|
||||
if not structure1 or not structure2:
|
||||
return 0.0
|
||||
|
||||
# Convert to sets for Jaccard similarity on structure
|
||||
set1 = set(structure1)
|
||||
set2 = set(structure2)
|
||||
|
||||
intersection = len(set1.intersection(set2))
|
||||
union = len(set1.union(set2))
|
||||
|
||||
jaccard = intersection / union if union > 0 else 0.0
|
||||
|
||||
# Also consider sequence similarity
|
||||
sequence_sim = self._sequence_similarity(structure1, structure2)
|
||||
|
||||
# Combine Jaccard and sequence similarity
|
||||
return (jaccard + sequence_sim) / 2
|
||||
|
||||
def _sequence_similarity(self, seq1: list[str], seq2: list[str]) -> float:
|
||||
"""Calculate similarity preserving sequence order."""
|
||||
if not seq1 and not seq2:
|
||||
return 1.0
|
||||
if not seq1 or not seq2:
|
||||
return 0.0
|
||||
|
||||
# Use dynamic programming for longest common subsequence
|
||||
lcs_length = self._lcs_length(seq1, seq2)
|
||||
max_length = max(len(seq1), len(seq2))
|
||||
|
||||
return lcs_length / max_length if max_length > 0 else 0.0
|
||||
|
||||
def _lcs_length(self, seq1: list[str], seq2: list[str]) -> int:
|
||||
"""Calculate length of longest common subsequence."""
|
||||
m, n = len(seq1), len(seq2)
|
||||
|
||||
# Create DP table
|
||||
dp = [[0] * (n + 1) for _ in range(m + 1)]
|
||||
|
||||
# Fill the DP table
|
||||
for i in range(1, m + 1):
|
||||
for j in range(1, n + 1):
|
||||
if seq1[i - 1] == seq2[j - 1]:
|
||||
dp[i][j] = dp[i - 1][j - 1] + 1
|
||||
else:
|
||||
dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
|
||||
|
||||
return dp[m][n]
|
||||
|
||||
def _fallback_similarity(self, text1: str, text2: str) -> float:
|
||||
"""Fallback to simple text similarity for malformed code."""
|
||||
import difflib
|
||||
|
||||
return difflib.SequenceMatcher(None, text1, text2).ratio()
|
||||
|
||||
|
||||
class TreeEditDistance(BaseSimilarityAlgorithm):
|
||||
"""Tree edit distance-based similarity algorithm."""
|
||||
|
||||
def __init__(self, config: SimilarityAlgorithmConfig | None = None):
|
||||
if config is None:
|
||||
config = SimilarityAlgorithmConfig(name="tree_edit", weight=0.2)
|
||||
super().__init__(config)
|
||||
|
||||
def calculate(self, text1: str, text2: str) -> float:
|
||||
"""Calculate similarity using simplified tree edit distance."""
|
||||
if not text1 and not text2:
|
||||
return 1.0
|
||||
if not text1 or not text2:
|
||||
return 0.0
|
||||
|
||||
try:
|
||||
tree1 = ast.parse(text1)
|
||||
tree2 = ast.parse(text2)
|
||||
except SyntaxError:
|
||||
# Fallback to text-based comparison
|
||||
import difflib
|
||||
|
||||
return difflib.SequenceMatcher(None, text1, text2).ratio()
|
||||
|
||||
# Simplified tree representation
|
||||
nodes1 = self._get_node_types(tree1)
|
||||
nodes2 = self._get_node_types(tree2)
|
||||
|
||||
# Calculate edit distance
|
||||
edit_distance = self._edit_distance(nodes1, nodes2)
|
||||
max_length = max(len(nodes1), len(nodes2))
|
||||
|
||||
# Convert to similarity score
|
||||
return 1 - (edit_distance / max_length) if max_length > 0 else 1.0
|
||||
|
||||
def _get_node_types(self, tree: ast.AST) -> list[str]:
|
||||
"""Extract node types from AST."""
|
||||
return [type(node).__name__ for node in ast.walk(tree)]
|
||||
|
||||
def _edit_distance(self, seq1: list[str], seq2: list[str]) -> int:
|
||||
"""Calculate edit distance between two sequences."""
|
||||
m, n = len(seq1), len(seq2)
|
||||
|
||||
# Create DP table
|
||||
dp = [[0] * (n + 1) for _ in range(m + 1)]
|
||||
|
||||
# Initialize base cases
|
||||
for i in range(m + 1):
|
||||
dp[i][0] = i
|
||||
for j in range(n + 1):
|
||||
dp[0][j] = j
|
||||
|
||||
# Fill the DP table
|
||||
for i in range(1, m + 1):
|
||||
for j in range(1, n + 1):
|
||||
if seq1[i - 1] == seq2[j - 1]:
|
||||
dp[i][j] = dp[i - 1][j - 1]
|
||||
else:
|
||||
dp[i][j] = 1 + min(
|
||||
dp[i - 1][j], # deletion
|
||||
dp[i][j - 1], # insertion
|
||||
dp[i - 1][j - 1], # substitution
|
||||
)
|
||||
|
||||
return dp[m][n]
|
||||
|
||||
|
||||
class DependencySimilarity(BaseSimilarityAlgorithm):
|
||||
"""Import and dependency-based similarity algorithm."""
|
||||
|
||||
def __init__(self, config: SimilarityAlgorithmConfig | None = None):
|
||||
if config is None:
|
||||
config = SimilarityAlgorithmConfig(name="dependency", weight=0.15)
|
||||
super().__init__(config)
|
||||
|
||||
def calculate(self, text1: str, text2: str) -> float:
|
||||
"""Calculate similarity based on imports and dependencies."""
|
||||
if not text1 and not text2:
|
||||
return 1.0
|
||||
if not text1 or not text2:
|
||||
return 0.0
|
||||
|
||||
try:
|
||||
deps1 = self._extract_dependencies(text1)
|
||||
deps2 = self._extract_dependencies(text2)
|
||||
except SyntaxError:
|
||||
return 0.0
|
||||
|
||||
return self._compare_dependencies(deps1, deps2)
|
||||
|
||||
def _extract_dependencies(self, code: str) -> set[str]:
|
||||
"""Extract import dependencies from code."""
|
||||
try:
|
||||
tree = ast.parse(code)
|
||||
except SyntaxError:
|
||||
return set()
|
||||
|
||||
dependencies = set()
|
||||
|
||||
for node in ast.walk(tree):
|
||||
if isinstance(node, ast.Import):
|
||||
for alias in node.names:
|
||||
dependencies.add(alias.name.split(".")[0])
|
||||
elif isinstance(node, ast.ImportFrom):
|
||||
if node.module:
|
||||
dependencies.add(node.module.split(".")[0])
|
||||
for alias in node.names:
|
||||
dependencies.add(alias.name)
|
||||
|
||||
return dependencies
|
||||
|
||||
def _compare_dependencies(self, deps1: set[str], deps2: set[str]) -> float:
|
||||
"""Compare two sets of dependencies."""
|
||||
if not deps1 and not deps2:
|
||||
return 1.0
|
||||
if not deps1 or not deps2:
|
||||
return 0.0
|
||||
|
||||
intersection = len(deps1.intersection(deps2))
|
||||
union = len(deps1.union(deps2))
|
||||
|
||||
return intersection / union if union > 0 else 0.0
|
||||
|
||||
|
||||
class IdentifierSimilarity(BaseSimilarityAlgorithm):
    """Variable and function name-based similarity algorithm."""

    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        if config is None:
            config = SimilarityAlgorithmConfig(name="identifier", weight=0.2)
        super().__init__(config)

    def calculate(self, text1: str, text2: str) -> float:
        """Calculate similarity based on identifier names."""
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0

        try:
            identifiers1 = self._extract_identifiers(text1)
            identifiers2 = self._extract_identifiers(text2)
        except SyntaxError:
            return 0.0

        return self._compare_identifiers(identifiers1, identifiers2)

    def _extract_identifiers(self, code: str) -> Counter[str]:
        """Extract all identifiers from code."""
        try:
            tree = ast.parse(code)
        except SyntaxError:
            return Counter()

        identifiers = []

        for node in ast.walk(tree):
            if isinstance(node, ast.Name):
                identifiers.append(node.id)
            elif isinstance(node, ast.FunctionDef | ast.ClassDef):
                identifiers.append(node.name)
            elif isinstance(node, ast.Attribute):
                identifiers.append(node.attr)

        return Counter(identifiers)

    def _compare_identifiers(self, ids1: Counter[str], ids2: Counter[str]) -> float:
        """Compare two sets of identifiers."""
        if not ids1 and not ids2:
            return 1.0
        if not ids1 or not ids2:
            return 0.0

        # Calculate cosine similarity on identifier frequencies
        all_ids = set(ids1.keys()) | set(ids2.keys())

        dot_product = sum(ids1[id_] * ids2[id_] for id_ in all_ids)
        magnitude1 = sum(ids1[id_] ** 2 for id_ in all_ids) ** 0.5
        magnitude2 = sum(ids2[id_] ** 2 for id_ in all_ids) ** 0.5

        if magnitude1 == 0 or magnitude2 == 0:
            return 0.0

        return dot_product / (magnitude1 * magnitude2)
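
A quick illustrative check of the two AST-based algorithms above (not part of the commit; it only assumes the classes are importable from this module):

    snippet_a = "import os\n\ndef load(path):\n    return os.path.exists(path)\n"
    snippet_b = "import os\n\ndef check(p):\n    return os.path.exists(p)\n"

    # Both snippets import only "os", so the Jaccard overlap of top-level
    # imports is 1.0.
    print(DependencySimilarity().calculate(snippet_a, snippet_b))

    # The identifier-frequency cosine is high but below 1.0, since the
    # local names differ (load/path vs check/p).
    print(IdentifierSimilarity().calculate(snippet_a, snippet_b))
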
131
src/quality/similarity/text_based.py
Normal file
@@ -0,0 +1,131 @@
"""Text-based similarity algorithms."""

import difflib

try:
    from Levenshtein import ratio as levenshtein_ratio

    LEVENSHTEIN_AVAILABLE = True
except ImportError:
    LEVENSHTEIN_AVAILABLE = False

from ..config.schemas import SimilarityAlgorithmConfig
from .base import BaseSimilarityAlgorithm


class LevenshteinSimilarity(BaseSimilarityAlgorithm):
    """Levenshtein distance-based similarity algorithm."""

    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        if config is None:
            config = SimilarityAlgorithmConfig(name="levenshtein", weight=0.2)
        super().__init__(config)

    def calculate(self, text1: str, text2: str) -> float:
        """Calculate similarity using Levenshtein distance."""
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0

        if LEVENSHTEIN_AVAILABLE:
            return levenshtein_ratio(text1, text2)
        else:
            # Fallback to difflib implementation
            return difflib.SequenceMatcher(None, text1, text2).ratio()


class DifflibSimilarity(BaseSimilarityAlgorithm):
    """Python difflib-based similarity algorithm."""

    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        if config is None:
            config = SimilarityAlgorithmConfig(name="difflib", weight=0.25)
        super().__init__(config)

    def calculate(self, text1: str, text2: str) -> float:
        """Calculate similarity using difflib SequenceMatcher."""
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0

        return difflib.SequenceMatcher(None, text1, text2).ratio()


class LongestCommonSubsequence(BaseSimilarityAlgorithm):
    """Longest Common Subsequence-based similarity."""

    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        if config is None:
            config = SimilarityAlgorithmConfig(name="lcs", weight=0.15)
        super().__init__(config)

    def calculate(self, text1: str, text2: str) -> float:
        """Calculate similarity using LCS."""
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0

        lcs_length = self._lcs_length(text1, text2)
        max_length = max(len(text1), len(text2))

        return lcs_length / max_length if max_length > 0 else 0.0

    def _lcs_length(self, text1: str, text2: str) -> int:
        """Calculate length of longest common subsequence."""
        m, n = len(text1), len(text2)

        # Create DP table
        dp = [[0] * (n + 1) for _ in range(m + 1)]

        # Fill the DP table
        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if text1[i - 1] == text2[j - 1]:
                    dp[i][j] = dp[i - 1][j - 1] + 1
                else:
                    dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])

        return dp[m][n]


class NGramSimilarity(BaseSimilarityAlgorithm):
    """N-gram based similarity algorithm."""

    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        if config is None:
            config = SimilarityAlgorithmConfig(
                name="ngram", weight=0.2, parameters={"n": 3}
            )
        super().__init__(config)
        n_param = self.config.parameters.get("n", 3)
        self.n: int = int(n_param) if isinstance(n_param, (int, float, str)) else 3

    def calculate(self, text1: str, text2: str) -> float:
        """Calculate similarity using n-grams."""
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0

        ngrams1 = set(self._get_ngrams(text1))
        ngrams2 = set(self._get_ngrams(text2))

        if not ngrams1 and not ngrams2:
            return 1.0
        if not ngrams1 or not ngrams2:
            return 0.0

        intersection = len(ngrams1.intersection(ngrams2))
        union = len(ngrams1.union(ngrams2))

        return intersection / union if union > 0 else 0.0

    def _get_ngrams(self, text: str) -> list[str]:
        """Generate n-grams from text."""
        if len(text) < self.n:
            return [text]

        return [text[i : i + self.n] for i in range(len(text) - self.n + 1)]
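
A small worked example of the text-based scores above (illustrative only; the values follow directly from the code):

    # LCS("abcde", "ace") is "ace", length 3; the longer input has length 5,
    # so the normalized score is 3/5 = 0.6.
    print(LongestCommonSubsequence().calculate("abcde", "ace"))

    # Uses python-Levenshtein's ratio() when installed, otherwise falls back
    # to difflib.SequenceMatcher; either way the result is in [0.0, 1.0].
    print(LevenshteinSimilarity().calculate("kitten", "sitting"))
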
271
src/quality/similarity/token_based.py
Normal file
@@ -0,0 +1,271 @@
"""Token-based similarity algorithms."""

import math
from collections import Counter

from ..config.schemas import SimilarityAlgorithmConfig
from .base import BaseSimilarityAlgorithm


class JaccardSimilarity(BaseSimilarityAlgorithm):
    """Jaccard similarity coefficient algorithm with enhanced tokenization."""

    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        if config is None:
            config = SimilarityAlgorithmConfig(name="jaccard", weight=0.3)
        super().__init__(config)

    def calculate(self, text1: str, text2: str) -> float:
        """Calculate similarity using Jaccard coefficient."""
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0

        tokens1 = set(self._tokenize(text1))
        tokens2 = set(self._tokenize(text2))

        if not tokens1 and not tokens2:
            return 1.0
        if not tokens1 or not tokens2:
            return 0.0

        intersection = len(tokens1.intersection(tokens2))
        union = len(tokens1.union(tokens2))

        return intersection / union if union > 0 else 0.0

    def _tokenize(self, text: str) -> list[str]:
        """Enhanced tokenization with semantic grouping for better duplicate detection."""
        import re

        # Python keywords and built-ins that should be preserved exactly
        keywords = {
            "def", "class", "if", "else", "elif", "for", "while", "try", "except",
            "finally", "with", "as", "import", "from", "return", "yield", "pass",
            "break", "continue", "and", "or", "not", "in", "is", "lambda", "None",
            "True", "False", "self", "cls", "len", "range", "str", "int", "float",
            "list", "dict", "tuple", "set", "bool", "append", "extend", "remove",
        }

        # Semantic variable name patterns (group similar names)
        semantic_patterns = [
            (r"\b(data|item|element|val|value|obj|object|thing)\w*\b", "DATA_VAR"),
            (r"\b(result|output|ret|return|res|response)\w*\b", "RESULT_VAR"),
            (r"\b(index|idx|i|j|k|counter|count|num|number)\w*\b", "INDEX_VAR"),
            (r"\b(name|id|key|identifier|label)\w*\b", "ID_VAR"),
            (r"\b(config|settings|options|params?|args?|kwargs?)\w*\b", "CONFIG_VAR"),
            (r"\b(path|file|dir|directory|filename)\w*\b", "PATH_VAR"),
            (r"\b(error|err|exception|ex)\w*\b", "ERROR_VAR"),
            (r"\b(temp|tmp|buffer|buf|cache)\w*\b", "TEMP_VAR"),
            (r"\b(min|max|avg|sum|total|count)\w*\b", "CALC_VAR"),
            (r"\b(user|person|client|customer)\w*\b", "USER_VAR"),
            (r"\b(width|height|size|length|dimension)\w*\b", "SIZE_VAR"),
        ]

        # First pass: extract all tokens
        tokens = re.findall(r"\b\w+\b", text.lower())

        # Second pass: apply semantic grouping and filtering
        processed_tokens = []
        for token in tokens:
            if len(token) <= 1:
                continue

            # Keep keywords and built-ins as-is
            if token in keywords:
                processed_tokens.append(token)
                continue

            # Apply semantic patterns to group similar variable names
            matched = False
            for pattern, replacement in semantic_patterns:
                if re.match(pattern, token):
                    processed_tokens.append(replacement)
                    matched = True
                    break

            if not matched:
                # Generic variable abstraction for remaining identifiers
                if re.match(r"^[a-zA-Z_]\w*$", token):
                    processed_tokens.append("VAR")
                else:
                    processed_tokens.append(token)

        return processed_tokens


class CosineSimilarity(BaseSimilarityAlgorithm):
    """Cosine similarity algorithm over raw term-frequency vectors."""

    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        if config is None:
            config = SimilarityAlgorithmConfig(name="cosine", weight=0.3)
        super().__init__(config)

    def calculate(self, text1: str, text2: str) -> float:
        """Calculate similarity using cosine similarity."""
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0

        tokens1 = self._tokenize(text1)
        tokens2 = self._tokenize(text2)

        if not tokens1 and not tokens2:
            return 1.0
        if not tokens1 or not tokens2:
            return 0.0

        # Create term frequency vectors
        tf1 = Counter(tokens1)
        tf2 = Counter(tokens2)

        # Get all unique terms
        all_terms = set(tf1.keys()) | set(tf2.keys())

        # Calculate cosine similarity
        dot_product = sum(tf1[term] * tf2[term] for term in all_terms)
        magnitude1 = math.sqrt(sum(tf1[term] ** 2 for term in all_terms))
        magnitude2 = math.sqrt(sum(tf2[term] ** 2 for term in all_terms))

        if magnitude1 == 0 or magnitude2 == 0:
            return 0.0

        return dot_product / (magnitude1 * magnitude2)

    def _tokenize(self, text: str) -> list[str]:
        """Tokenize text into words/identifiers."""
        import re

        # Split on whitespace and common delimiters
        tokens = re.findall(r"\b\w+\b", text.lower())
        return [token for token in tokens if len(token) > 1]


class TFIDFSimilarity(BaseSimilarityAlgorithm):
    """TF-IDF based cosine similarity algorithm."""

    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        if config is None:
            config = SimilarityAlgorithmConfig(name="tfidf", weight=0.25)
        super().__init__(config)

    def calculate(self, text1: str, text2: str) -> float:
        """Calculate similarity using TF-IDF weighted cosine similarity."""
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0

        tokens1 = self._tokenize(text1)
        tokens2 = self._tokenize(text2)

        if not tokens1 and not tokens2:
            return 1.0
        if not tokens1 or not tokens2:
            return 0.0

        # Calculate TF for both texts
        tf1 = self._calculate_tf(tokens1)
        tf2 = self._calculate_tf(tokens2)

        # Calculate IDF for all terms
        all_terms = set(tf1.keys()) | set(tf2.keys())
        idf = self._calculate_idf(all_terms, [tokens1, tokens2])

        # Calculate TF-IDF vectors
        tfidf1 = {term: tf1.get(term, 0) * idf[term] for term in all_terms}
        tfidf2 = {term: tf2.get(term, 0) * idf[term] for term in all_terms}

        # Calculate cosine similarity
        dot_product = sum(tfidf1[term] * tfidf2[term] for term in all_terms)
        magnitude1 = math.sqrt(sum(tfidf1[term] ** 2 for term in all_terms))
        magnitude2 = math.sqrt(sum(tfidf2[term] ** 2 for term in all_terms))

        if magnitude1 == 0 or magnitude2 == 0:
            return 0.0

        return dot_product / (magnitude1 * magnitude2)

    def _tokenize(self, text: str) -> list[str]:
        """Tokenize text into words/identifiers."""
        import re

        # Split on whitespace and common delimiters
        tokens = re.findall(r"\b\w+\b", text.lower())
        return [token for token in tokens if len(token) > 1]

    def _calculate_tf(self, tokens: list[str]) -> dict[str, float]:
        """Calculate term frequency."""
        tf = Counter(tokens)
        total_terms = len(tokens)
        return {term: count / total_terms for term, count in tf.items()}

    def _calculate_idf(
        self, terms: set[str], documents: list[list[str]]
    ) -> dict[str, float]:
        """Calculate inverse document frequency."""
        idf = {}
        total_docs = len(documents)

        for term in terms:
            docs_containing_term = sum(1 for doc in documents if term in doc)
            idf[term] = math.log(
                total_docs / (docs_containing_term + 1)
            )  # +1 for smoothing

        return idf


class ShingleSimilarity(BaseSimilarityAlgorithm):
    """Shingle-based similarity algorithm."""

    def __init__(self, config: SimilarityAlgorithmConfig | None = None):
        if config is None:
            config = SimilarityAlgorithmConfig(
                name="shingle", weight=0.2, parameters={"k": 4}
            )
        super().__init__(config)
        k_param = self.config.parameters.get("k", 4)
        self.k: int = int(k_param) if isinstance(k_param, (int, float, str)) else 4

    def calculate(self, text1: str, text2: str) -> float:
        """Calculate similarity using k-shingles."""
        if not text1 and not text2:
            return 1.0
        if not text1 or not text2:
            return 0.0

        shingles1 = set(self._get_shingles(text1))
        shingles2 = set(self._get_shingles(text2))

        if not shingles1 and not shingles2:
            return 1.0
        if not shingles1 or not shingles2:
            return 0.0

        intersection = len(shingles1.intersection(shingles2))
        union = len(shingles1.union(shingles2))

        return intersection / union if union > 0 else 0.0

    def _get_shingles(self, text: str) -> list[str]:
        """Generate k-shingles from text."""
        tokens = self._tokenize(text)

        if len(tokens) < self.k:
            return [" ".join(tokens)]

        return [
            " ".join(tokens[i : i + self.k]) for i in range(len(tokens) - self.k + 1)
        ]

    def _tokenize(self, text: str) -> list[str]:
        """Tokenize text into words/identifiers."""
        import re

        # Split on whitespace and common delimiters
        tokens = re.findall(r"\b\w+\b", text.lower())
        return [token for token in tokens if len(token) > 1]
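
An illustrative check of the semantic tokenization above (not part of the commit). Two loops that differ only in naming score highly, because "total" and "sum_val" both collapse into the CALC_VAR bucket, while single-character names such as "i" are filtered out before grouping:

    code_a = "for i in range(10): total = total + i"
    code_b = "for idx in range(10): sum_val = sum_val + idx"

    # Token sets become {for, in, range, 10, CALC_VAR} and
    # {for, in, range, 10, CALC_VAR, INDEX_VAR}, so Jaccard = 5/6 ~= 0.83.
    print(JaccardSimilarity().calculate(code_a, code_b))
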
1
src/quality/utils/__init__.py
Normal file
@@ -0,0 +1 @@
"""Utility modules for the quality analysis package."""
222
src/quality/utils/file_finder.py
Normal file
@@ -0,0 +1,222 @@
"""File discovery utilities for quality analysis."""

import fnmatch
from pathlib import Path
from typing import Any

from ..config.schemas import LanguageConfig, PathConfig


class FileFinder:
    """Finds relevant source files for analysis."""

    def __init__(self, path_config: PathConfig, language_config: LanguageConfig):
        self.path_config = path_config
        self.language_config = language_config

    def find_files(self, root_path: Path) -> list[Path]:
        """Find all relevant source files in the given path."""
        if not root_path.exists():
            return []

        if root_path.is_file():
            return [root_path] if self._should_include_file(root_path) else []

        found_files = []
        files_processed = 0

        # Get all supported extensions
        extensions = set()
        for lang in self.language_config.languages:
            if lang in self.language_config.file_extensions:
                extensions.update(self.language_config.file_extensions[lang])

        # Walk through directory
        for file_path in root_path.rglob("*"):
            if not file_path.is_file():
                continue

            # Check max files limit
            if (
                self.path_config.max_files is not None
                and files_processed >= self.path_config.max_files
            ):
                break

            # Check if file should be included
            if self._should_include_file(file_path):
                found_files.append(file_path)
                files_processed += 1

        return found_files

    def find_python_files(self, root_path: Path) -> list[Path]:
        """Find only Python files."""
        if not root_path.exists():
            return []

        if root_path.is_file():
            return [root_path] if self._is_python_file(root_path) else []

        found_files = []
        for file_path in root_path.rglob("*.py"):
            if self._should_include_file(file_path) and self._is_python_file(file_path):
                found_files.append(file_path)

        return found_files

    def _should_include_file(self, file_path: Path) -> bool:
        """Check if a file should be included in analysis."""
        path_str = str(file_path)

        # Check exclude patterns first
        for pattern in self.path_config.exclude_patterns:
            if fnmatch.fnmatch(path_str, pattern) or fnmatch.fnmatch(
                file_path.name, pattern
            ):
                return False

        # Check include patterns
        for pattern in self.path_config.include_patterns:
            if fnmatch.fnmatch(path_str, pattern) or fnmatch.fnmatch(
                file_path.name, pattern
            ):
                # Check if it's a supported file type
                return self._has_supported_extension(file_path)

        return False

    def _has_supported_extension(self, file_path: Path) -> bool:
        """Check if file has a supported extension."""
        suffix = file_path.suffix.lower()

        for lang in self.language_config.languages:
            if (
                lang in self.language_config.file_extensions
                and suffix in self.language_config.file_extensions[lang]
            ):
                return True

        return False

    def _is_python_file(self, file_path: Path) -> bool:
        """Check if file is a Python file."""
        return file_path.suffix.lower() in [".py", ".pyx", ".pyi"]

    def get_file_language(self, file_path: Path) -> str | None:
        """Determine the programming language of a file."""
        suffix = file_path.suffix.lower()

        for lang, extensions in self.language_config.file_extensions.items():
            if suffix in extensions:
                return lang

        return None

    def get_project_stats(self, root_path: Path) -> dict[str, Any]:
        """Get statistics about files in the project."""
        stats = {
            "total_files": 0,
            "supported_files": 0,
            "excluded_files": 0,
            "by_language": {},
        }

        if not root_path.exists():
            return stats

        # Initialize language counters
        for lang in self.language_config.languages:
            stats["by_language"][lang] = 0

        # Walk through all files
        for file_path in root_path.rglob("*"):
            if not file_path.is_file():
                continue

            stats["total_files"] += 1

            if self._should_include_file(file_path):
                stats["supported_files"] += 1
                lang = self.get_file_language(file_path)
                if lang and lang in stats["by_language"]:
                    stats["by_language"][lang] += 1
            else:
                stats["excluded_files"] += 1

        return stats

    def filter_files_by_patterns(
        self,
        files: list[Path],
        include_patterns: list[str] | None = None,
        exclude_patterns: list[str] | None = None,
    ) -> list[Path]:
        """Filter files by additional patterns."""
        filtered = []

        for file_path in files:
            path_str = str(file_path)
            include = True

            # Apply exclude patterns
            if exclude_patterns:
                for pattern in exclude_patterns:
                    if fnmatch.fnmatch(path_str, pattern) or fnmatch.fnmatch(
                        file_path.name, pattern
                    ):
                        include = False
                        break

            # Apply include patterns
            if include and include_patterns:
                include = False
                for pattern in include_patterns:
                    if fnmatch.fnmatch(path_str, pattern) or fnmatch.fnmatch(
                        file_path.name, pattern
                    ):
                        include = True
                        break

            if include:
                filtered.append(file_path)

        return filtered

    def get_file_size_stats(self, files: list[Path]) -> dict[str, int]:
        """Get file size statistics."""
        sizes = []
        total_size = 0
        total_lines = 0

        for file_path in files:
            try:
                size = file_path.stat().st_size
                sizes.append(size)
                total_size += size

                # Count lines
                with open(file_path, encoding="utf-8") as f:
                    lines = sum(1 for _ in f)
                total_lines += lines
            except (OSError, UnicodeDecodeError):
                continue

        if not sizes:
            return {
                "total_files": 0,
                "total_size_bytes": 0,
                "total_lines": 0,
                "average_size_bytes": 0,
                "average_lines_per_file": 0,
            }

        return {
            "total_files": len(sizes),
            "total_size_bytes": total_size,
            "total_lines": total_lines,
            "average_size_bytes": total_size // len(sizes),
            "average_lines_per_file": total_lines // len(sizes),
            "largest_file_bytes": max(sizes),
            "smallest_file_bytes": min(sizes),
        }
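
A minimal usage sketch for FileFinder (illustrative only; the PathConfig and LanguageConfig constructor arguments are assumed to mirror the attributes the class reads: include_patterns, exclude_patterns, max_files, languages, file_extensions):

    from pathlib import Path

    # Assumed keyword arguments, matching the attributes used above.
    path_cfg = PathConfig(
        include_patterns=["*.py"],
        exclude_patterns=["*/tests/*", "*_test.py"],
        max_files=None,
    )
    lang_cfg = LanguageConfig(
        languages=["python"],
        file_extensions={"python": [".py", ".pyi"]},
    )

    finder = FileFinder(path_cfg, lang_cfg)
    files = finder.find_python_files(Path("src"))
    print(finder.get_file_size_stats(files))
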