Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # "7z Ripper.ps1" - Copyright Tim Keal alias jargon 2025 03/11
- # This script scans for 7zip archives and lists file names matched by similarity threshold.
- # This script is for Powershell 7 ("pwsh.exe" not "Powershell.exe")
- # Requires 7-Zip installed
# Location of the 7-Zip command-line executable used by the archive scanner.
$SevenZipPath = "C:\Program Files\7-Zip\7z.exe"

# Abort early (exit code 1) when the executable is not an existing file.
if (!(Test-Path -Path $SevenZipPath -PathType Leaf)) {
    Write-Host "Error: 7-Zip not found."
    exit 1
}
# Keeps the first character of the given text as-is and lower-cases every
# character after it. Text of length 0 or 1 is returned unchanged.
function Convert-UpperCaseSeries {
    param (
        [string]$inputString
    )

    # Nothing to lower-case when there is no character after the first one.
    if ($inputString.Length -le 1) {
        return $inputString
    }

    $head = $inputString.Substring(0, 1)
    $tail = $inputString.Substring(1).ToLower()
    return $head + $tail
}
# Computes how similar two strings are, as a percentage derived from their
# Levenshtein (edit) distance:
#     similarity = (1 - distance / max(length of source, length of target)) * 100
#
# Parameters:
#   source - first string
#   target - second string
# Returns:
#   A [double] rounded to 6 decimal places. Two empty strings yield 100.
function Get-LevenshteinSimilarity {
    param ([string]$source, [string]$target)

    $sourceLength = $source.Length
    $targetLength = $target.Length
    # Two empty strings are identical; returning here also avoids dividing by zero below.
    if ($sourceLength -eq 0 -and $targetLength -eq 0) { return 100.0 }

    # Only the previous and the current row of the distance table are needed,
    # so two typed int arrays replace the full (source+1) x (target+1) matrix.
    $previousRow = [int[]]::new($targetLength + 1)
    $currentRow = [int[]]::new($targetLength + 1)

    # Row 0: turning an empty prefix of source into target[0..j) takes j insertions.
    for ($j = 0; $j -le $targetLength; $j++) { $previousRow[$j] = $j }

    for ($i = 1; $i -le $sourceLength; $i++) {
        # Column 0: turning source[0..i) into an empty string takes i deletions.
        $currentRow[0] = $i
        for ($j = 1; $j -le $targetLength; $j++) {
            $cost = if ($source[$i - 1] -eq $target[$j - 1]) { 0 } else { 1 }
            $delete = $previousRow[$j] + 1
            $insert = $currentRow[$j - 1] + 1
            $substitute = $previousRow[$j - 1] + $cost
            $currentRow[$j] = [math]::Min([math]::Min($delete, $insert), $substitute)
        }
        # The row just filled becomes the "previous" row of the next iteration.
        $previousRow, $currentRow = $currentRow, $previousRow
    }

    # After the final swap (or with an empty source) the answer sits in $previousRow.
    $distance = $previousRow[$targetLength]
    $maxLength = [math]::Max($sourceLength, $targetLength)

    # Force floating-point arithmetic so the result type is always [double].
    $similarity = (1.0 - ($distance / [double]$maxLength)) * 100.0
    return [math]::Round([double]$similarity, 6)
}
# Returns $true when an Escape key press is waiting in the console input buffer.
# Any other pending key is read (without echo) and discarded.
function Test-ScanEscapeKey {
    try {
        if ([System.Console]::KeyAvailable) {
            $key = [System.Console]::ReadKey($true) # $true: do not echo the key
            return ($key.Key -eq "Escape")
        }
    } catch {
        # Polling the keyboard fails when console input is redirected;
        # in that case the scan simply cannot be interrupted with Escape.
    }
    return $false
}

# Scans .7z archives below a directory and records archive entries whose file
# name is similar enough to a search term.
#
# For each archive, a "cleaned" label is derived from the archive file name and
# compared with $archiveTestString; archives scoring below $archiveThreshold are
# skipped. The remaining archives are listed with 7-Zip ($SevenZipPath, defined
# outside this function) and each listed file name is compared with $searchTerm.
# Hits are written, best match first, to "7z_scan_results.txt" in the resolved
# search directory. Pressing Escape stops the scan; hits found so far are saved.
#
# Parameters:
#   searchTerm        - file name to look for inside the archives
#   path              - directory searched recursively for *.7z files
#   threshold         - minimum similarity (percent) for an entry to count as a hit
#   archiveTestString - text the cleaned archive label is compared with
#   archiveThreshold  - minimum similarity (percent) for an archive to be scanned
#   extensionPattern  - regex alternation of accepted file extensions (e.g. "js|css");
#                       defaults to the caller's $searchExtensions variable, which is
#                       what this function read implicitly before
function Scan-7zArchives {
    param (
        [string]$searchTerm,
        [string]$path = "./",
        [double]$threshold = 70,
        [string]$archiveTestString = "",
        [double]$archiveThreshold = 70,
        [string]$extensionPattern = $searchExtensions
    )

    $basePath = Resolve-Path -Path $path
    $archives = Get-ChildItem -Path $basePath -Filter "*.7z" -Recurse -File |
        Where-Object { $_.FullName -notmatch '.*\.git.*' }
    $outputFile = Join-Path -Path $basePath -ChildPath "7z_scan_results.txt"

    # A generic list avoids re-allocating the whole array on every hit.
    $hits = [System.Collections.Generic.List[string]]::new()
    $escapeRequested = $false

    foreach ($archive in $archives) {
        # Stop before doing any work on this archive if Escape was pressed.
        if (Test-ScanEscapeKey) {
            $escapeRequested = $true
            break
        }

        $archiveNameOnly = $archive.Name

        # Timestamp token in the archive name: three or four dash-separated
        # groups of four digits.
        $timestampPattern = '\b\d{4}-\d{4}-\d{4}(?:-\d{4})?\b'
        if ($archiveNameOnly -match $timestampPattern) {
            $timestamp = $matches[0]
        } else {
            $timestamp = "NO_TIMESTAMP"
        }

        # Everything before " <timestamp>" is treated as the project name.
        $projectPattern = "^(.*) " + [regex]::Escape($timestamp)
        if ($archiveNameOnly -cmatch $projectPattern) {
            $project = $matches[1]
        } else {
            $project = "UNKNOWN_PROJECT"
        }

        # Drop the project/timestamp prefix, then normalise whitespace.
        $cleanedName = $archiveNameOnly -replace $projectPattern, "" -replace "\s{2,}", " " -replace "^\s+|\s+$", ""

        # If the remainder contains parentheses, keep only the text inside them.
        $parenPattern = '.*\((.*?)\).*'
        if ($cleanedName -cmatch $parenPattern) {
            $cleanedName = $matches[1]
        }

        # Noise removed from the label; (?-i) makes those patterns case-sensitive.
        # NOTE(review): in the second pattern the alternation splits into
        # '\b[A-Z]{3,}' and 'OK\b'; '\b(?:[A-Z]{3,}|OK)\b' may have been intended - confirm.
        $patterns = @(
            '(?-i)\bAUTOMATIC BACKUP\b',
            '(?-i)\b[A-Z]{3,}|OK\b',
            ' \+ ',
            '^\+',
            '\+$',
            ' \+ ',
            '\+'
        )
        foreach ($p in $patterns) {
            $cleanedName = $cleanedName -replace $p, " " -replace "\s{2,}", " " -replace "^\s+|\s+$", ""
        }

        # -replace is case-insensitive, so '([A-Z]+)' matches every run of letters;
        # each run keeps its first character and has the rest lower-cased.
        $letterRunPattern = '([A-Z]+)'
        $cleanedName = $cleanedName -replace $letterRunPattern, { Convert-UpperCaseSeries $_.Value }
        $cleanedName = $cleanedName.Trim()

        $archiveSimilarity = Get-LevenshteinSimilarity -source $archiveTestString -target $cleanedName
        $formattedArchiveSimilarity = $archiveSimilarity.ToString("000.00000000")
        $dumpName = "$formattedArchiveSimilarity% > $project > $timestamp > $cleanedName"
        if ($archiveSimilarity -lt $archiveThreshold) {
            Write-Host "Skipping: $dumpName"
            continue
        }
        Write-Host "Scanning: $dumpName"

        # List the archive contents; 7-Zip's error stream is discarded.
        $output = & $SevenZipPath l -ba "$($archive.FullName)" 2>$null
        if ($LASTEXITCODE -ne 0) {
            # Do not fail the whole scan, but do not hide the problem either.
            Write-Warning "7-Zip exited with code $LASTEXITCODE while listing: $($archive.FullName)"
        }

        foreach ($line in $output) {
            if (Test-ScanEscapeKey) {
                $escapeRequested = $true
                break
            }

            # Strip the leading "date time <token> <number>" columns from the listing line.
            # NOTE(review): if the listing has a further numeric column before the name,
            # it stays in $fileNameOnly for entries without a directory part - confirm
            # against real 7-Zip output.
            $fileNameOnly = $line -replace '^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\s+\S+\s+\d+\s+', ''

            # Skip empty names and lines containing "D...." (presumably directory entries).
            if ($fileNameOnly -match '^\s*$' -or $line -match 'D\.\.\.\.') { continue }
            # Skip anything located in a ".git" or ".vs" directory.
            if ($fileNameOnly -match '^(.*(\\|\/)){0,}\.(git|vs)((\\|\/).*){0,}$') { continue }
            # Keep only entries with one of the accepted extensions.
            if ($fileNameOnly -notmatch "\.($extensionPattern)$") { continue }

            Write-Host "Checking: $fileNameOnly"
            $fileNameOnly = [System.IO.Path]::GetFileName($fileNameOnly)
            $similarity = Get-LevenshteinSimilarity -source $searchTerm -target $fileNameOnly
            $formattedSimilarity = $similarity.ToString("000.00000000")
            if ($similarity -ge $threshold) {
                $matchString = "$formattedSimilarity% > $archiveNameOnly > $fileNameOnly"
                Write-Host "Found: $matchString"
                $hits.Add($matchString)
            }
        }

        if ($escapeRequested) { break }
    }

    if ($escapeRequested) {
        Write-Host "Escape key pressed. Exiting..."
    }

    # Order hits by the similarity value at the start of each line, best first.
    $sortedHits = $hits | Sort-Object {
        $similarity = 0
        if ($_ -match '^(\d+\.\d+)%') {
            $similarity = [double]$matches[1]
        }
        $similarity
    } -Descending
    $hitCount = $hits.Count

    # Write to output file
    Set-Content -Path $outputFile -Value "7z Archive Scan Results`r`nHits: $hitCount`r`n====================================`r`n"
    Add-Content -Path $outputFile -Value ($sortedHits -join "`r`n")
    Write-Host "Scan complete. $hitCount results saved to: $outputFile"
}
# User input handling: every prompt falls back to a default when left empty.

# Prompts for a percentage value.
# Returns $default when the answer is empty or is not a number (a trailing "%"
# is tolerated), instead of aborting the script with a conversion error.
function Read-PercentOrDefault {
    param (
        [string]$prompt,
        [double]$default
    )
    $raw = Read-Host $prompt
    if (-not $raw) { return $default }

    [double]$parsed = 0
    $isNumber = [double]::TryParse(
        $raw.Trim().TrimEnd('%').Trim(),
        [System.Globalization.NumberStyles]::Float,
        [System.Globalization.CultureInfo]::InvariantCulture,
        [ref]$parsed)
    if (-not $isNumber) {
        Write-Warning "'$raw' is not a number; using default $default."
        return $default
    }
    return $parsed
}

$searchTerm = Read-Host "Enter filename to search"
if (-not $searchTerm) { $searchTerm = "Connection" }

$searchPath = Read-Host "Enter path to search in (default is current)"
if (-not $searchPath) { $searchPath = "./" }

$searchThreshold = Read-PercentOrDefault -prompt "Threshold in percent for file match" -default 90

$archiveTestString = Read-Host "Enter test string for archive filtering"
if (-not $archiveTestString) { $archiveTestString = "Jp Sys" }

$archiveThreshold = Read-PercentOrDefault -prompt "Threshold in percent for archive match" -default 30

$searchExtensions = Read-Host "Valid extensions separated by commas"
if (-not $searchExtensions) { $searchExtensions = "js,json,css,html,png" }

# Turn the comma-separated list into a regex alternation ("js|json|..."):
# trim each item, drop a leading "." or "*.", ignore empty items, and escape
# regex metacharacters so that input such as "c++" is matched literally.
$extensionList = @(
    $searchExtensions -split ',' |
        ForEach-Object { $_.Trim() -replace '^\*?\.', '' } |
        Where-Object { $_ }
)
if ($extensionList.Count -eq 0) {
    # Only separators/whitespace were entered: use the default list.
    $extensionList = @('js', 'json', 'css', 'html', 'png')
}
$searchExtensions = ($extensionList | ForEach-Object { [regex]::Escape($_) }) -join '|'

Scan-7zArchives -searchTerm $searchTerm -path $searchPath -threshold $searchThreshold -archiveTestString $archiveTestString -archiveThreshold $archiveThreshold
Read-Host "Press Enter to exit"
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement