Advertisement
jargon

7z Ripper.ps1

Mar 11th, 2025 (edited)
135
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. # "7z Ripper.ps1" - Copyright Tim Keal alias jargon 2025 03/11
  2. # This script scans for 7zip archives and lists file names matched by similarity threshold.
  3. # This script is for PowerShell 7 (run it with "pwsh.exe", not "powershell.exe").
  4.  
  5. # Requires 7-Zip installed
  6. $SevenZipPath = "C:\Program Files\7-Zip\7z.exe"
  7.  
  8. if (-not (Test-Path $SevenZipPath -PathType Leaf)) {
  9.     Write-Host "Error: 7-Zip not found."
  10.     exit 1
  11. }
  12.  
  13. function Convert-UpperCaseSeries {
  14.     param (
  15.         [string]$inputString
  16.     )
  17.  
  18.     if ($inputString.Length -gt 1) {
  19.         return $inputString.Substring(0,1) + $inputString.Substring(1).ToLower()
  20.     }
  21.    
  22.     return $inputString
  23. }
  24.  
  25. # Function to calculate Levenshtein Distance as a percentile
  26. function Get-LevenshteinSimilarity {
  27.     param ([string]$source, [string]$target)
  28.    
  29.     $sourceLength = $source.Length
  30.     $targetLength = $target.Length
  31.    
  32.     if ($sourceLength -eq 0 -and $targetLength -eq 0) { return 100.000000 }
  33.    
  34.     $distanceMatrix = @()
  35.     for ($i = 0; $i -le $sourceLength; $i++) {
  36.         $distanceMatrix += ,(@(0) * ($targetLength + 1))  
  37.     }
  38.    
  39.     for ($i = 0; $i -le $sourceLength; $i++) { $distanceMatrix[$i][0] = $i }
  40.     for ($j = 0; $j -le $targetLength; $j++) { $distanceMatrix[0][$j] = $j }
  41.    
  42.     for ($i = 1; $i -le $sourceLength; $i++) {
  43.         for ($j = 1; $j -le $targetLength; $j++) {
  44.             $cost = if ($source[$i - 1] -eq $target[$j - 1]) { 0 } else { 1 }
  45.            
  46.             $delete = $distanceMatrix[$i - 1][$j] + 1
  47.             $insert = $distanceMatrix[$i][$j - 1] + 1
  48.             $substitute = $distanceMatrix[$i - 1][$j - 1] + $cost
  49.            
  50.             $distanceMatrix[$i][$j] = [math]::Min([math]::Min($delete, $insert), $substitute)
  51.         }
  52.     }
  53.    
  54.     $distance = $distanceMatrix[$sourceLength][$targetLength]
  55.     $maxLength = [math]::Max($sourceLength, $targetLength)
  56.     $similarity = ((1 - ($distance / $maxLength)) * 100)
  57.     return [math]::Round($similarity, 6)
  58. }
  59.  
  60. # Function to scan inside .7z files that meet archive similarity criteria
  61. function Scan-7zArchives {
  62.     param (
  63.         [string]$searchTerm,
  64.         [string]$path = "./",
  65.         [double]$threshold = 70,
  66.         [string]$archiveTestString = "",
  67.         [double]$archiveThreshold = 70
  68.     )
  69.    
  70.     $basePath = Resolve-Path -Path $path
  71.     $archives = Get-ChildItem -Path $basePath -Filter "*.7z" -Recurse -File | Where-Object { $_.FullName -notmatch '.*\.git.*' }
  72.     $outputFile = "$basePath\7z_scan_results.txt"
  73.     $hits = @()
  74.     $EarlyExit = 0
  75.  
  76.     foreach ($archive in $archives) {
  77.  
  78.         if($EarlyExit -eq 1) { break; }
  79.  
  80.         # Check if a key is pressed
  81.         if ([System.Console]::KeyAvailable) {
  82.             $key = [System.Console]::ReadKey($true)  # Read the key without displaying it
  83.  
  84.             if ($key.Key -eq "Escape") {
  85.                 Write-Host "Escape key pressed. Exiting..."
  86.                 $EarlyExit = 1
  87.             }
  88.         }
  89.  
  90.         $archiveNameOnly = [System.IO.Path]::GetFileName($archive.Name)
  91.        
  92.         $pattern = '\b\d{4}-\d{4}-\d{4}(?:-\d{4})?\b'
  93.         if ($archiveNameOnly -match $pattern) {
  94.             $timestamp = $matches[0]
  95.         } else {
  96.             $timestamp = "NO_TIMESTAMP"
  97.         }
  98.        
  99.         $pattern = "^(.*) " + [regex]::Escape($timestamp)
  100.         if ($archiveNameOnly -cmatch $pattern) {
  101.             $project = $matches[1]
  102.         } else {
  103.             $project = "UNKNOWN_PROJECT"
  104.         }
  105.        
  106.         $cleanedName = $archiveNameOnly -replace $pattern, "" -replace "\s{2,}", " " -replace "^\s+|\s+$", ""
  107.        
  108.         $pattern = '.*\((.*?)\).*'
  109.         if ($cleanedName -cmatch $pattern) {
  110.             $cleanedName = $matches[1]
  111.         }
  112.        
  113.         $patterns = @(
  114.             '(?-i)\bAUTOMATIC BACKUP\b',
  115.             '(?-i)\b[A-Z]{3,}|OK\b',
  116.             ' \+ ',
  117.             '^\+',
  118.             '\+$',
  119.             ' \+ ',
  120.             '\+'
  121.         )
  122.        
  123.         foreach ($p in $patterns) {
  124.             $cleanedName = $cleanedName -replace $p, " " -replace "\s{2,}", " " -replace "^\s+|\s+$", ""
  125.         }
  126.        
  127.         $pattern = '([A-Z]+)'
  128.         $cleanedName = $cleanedName -replace $pattern, { Convert-UpperCaseSeries $_.Value }
  129.        
  130.         $cleanedName = $cleanedName.Trim()
  131.        
  132.         # Write-Output "$project $timestamp $cleanedName"
  133.  
  134.         $archiveSimilarity = Get-LevenshteinSimilarity -source $archiveTestString -target $cleanedName
  135.        
  136.         $formattedArchiveSimilarity = $archiveSimilarity.ToString("000.00000000")
  137.        
  138.         $dumpName = "$formattedArchiveSimilarity% > $project > $timestamp > $cleanedName"
  139.        
  140.         if ($archiveSimilarity -lt $archiveThreshold) {
  141.             Write-Host "Skipping: $dumpName"
  142.             continue
  143.         }
  144.  
  145.         Write-Host "Scanning: $dumpName"
  146.        
  147.         $output = & $SevenZipPath l -ba "$($archive.FullName)" 2>$null
  148.  
  149.         foreach ($line in $output) {
  150.  
  151.             # Check if a key is pressed
  152.             if ([System.Console]::KeyAvailable) {
  153.                 $key = [System.Console]::ReadKey($true)  # Read the key without displaying it
  154.    
  155.                 if ($key.Key -eq "Escape") {
  156.                     $EarlyExit = 1
  157.                 }
  158.             }
  159.            
  160.             if($EarlyExit -eq 1) { break; }
  161.            
  162.             $fileNameOnly = $line -replace '^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\s+\S+\s+\d+\s+', ''
  163.            
  164.             if ($fileNameOnly -match '^\s*$' -or $line -match 'D\.\.\.\.') { continue }
  165.            
  166.             if ( $fileNameOnly -match '^(.*(\\|\/)){0,}\.(git|vs)((\\|\/).*){0,}$' ) { continue; }
  167.            
  168.             if ( $fileNameOnly -notmatch "\.($searchExtensions)$" ) { continue; }
  169.  
  170.             Write-Host "Checking: $fileNameOnly"
  171.             $fileNameOnly = [System.IO.Path]::GetFileName($fileNameOnly)
  172.  
  173.             $similarity = Get-LevenshteinSimilarity -source $searchTerm -target $fileNameOnly
  174.             $formattedSimilarity = $similarity.ToString("000.00000000")
  175.             if ($similarity -ge $threshold) {
  176.                 $matchString = "$formattedSimilarity% > $archiveNameOnly > $fileNameOnly"
  177.                 Write-Host "Found: $matchString"
  178.                 $hits += $matchString
  179.             }
  180.         }
  181.     }
  182.  
  183.     $sortedHits = $hits | Sort-Object {
  184.         $similarity = 0
  185.         if ($_ -match '^(\d+\.\d+)%') {
  186.             $similarity = [double]$matches[1]  # Corrected: Extract similarity from the captured match
  187.         }
  188.         $similarity
  189.     } -Descending
  190.    
  191.     $hitCount = $hits.Count
  192.    
  193.     # Write to output file
  194.     Set-Content -Path $outputFile -Value "7z Archive Scan Results`r`nHits: $hitCount`r`n====================================`r`n"
  195.     Add-Content -Path $outputFile -Value ($sortedHits -join "`r`n")
  196.  
  197.     Write-Host "Scan complete. $hitCount results saved to: $outputFile"
  198. }
  199.  
  200. # User input handling
  201. $searchTerm = Read-Host "Enter filename to search"
  202. if (-not $searchTerm) { $searchTerm = "Connection" }
  203.  
  204. $searchPath = Read-Host "Enter path to search in (default is current)"
  205. if (-not $searchPath) { $searchPath = "./" }
  206.  
  207. $searchThreshold = Read-Host "Threshold in percent for file match"
  208. if (-not $searchThreshold) { $searchThreshold = "90" }
  209. $searchThreshold = [double]$searchThreshold
  210.  
  211. $archiveTestString = Read-Host "Enter test string for archive filtering"
  212. if (-not $archiveTestString) { $archiveTestString = "Jp Sys" }
  213.  
  214. $archiveThreshold = Read-Host "Threshold in percent for archive match"
  215. if (-not $archiveThreshold) { $archiveThreshold = "30" }
  216. $archiveThreshold = [double]$archiveThreshold
  217.  
  218. $searchExtensions = Read-Host "Valid extensions separated by commas"
  219. if (-not $searchExtensions) { $searchExtensions = "js,json,css,html,png" }
  220.  
  221. $searchExtensions = $searchExtensions -replace '\s*,\s*', '|'
  222.  
  223. Scan-7zArchives -searchTerm $searchTerm -path $searchPath -threshold $searchThreshold -archiveTestString $archiveTestString -archiveThreshold $archiveThreshold
  224.  
  225. Read-Host "Press Enter to exit"
  226.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement