# analyse fdupes

#!/bin/python3
#
# analyse the output of fdupes
# 1. run fdupes -rS * > fdupesoutput
# 2. run this script for a summary
#
# The above fdupes command gives output such as:
# 5060855 bytes each:
# ./Photos/Photos 2011/IMG_1740.JPG
# ./Photos/Uploaded Photos/IMG_1740.JPG
#
# This Python script counts the number of duplicate files and
# the total amount of space wasted by the duplicates
#

INPUT = "fdupesoutput"


def summarise_fdupes(lines):
  """Tally the duplicates described by ``fdupes -S`` style output.

  The input is a sequence of groups separated by blank lines. Each group
  starts with a size header such as ``5060855 bytes each:`` followed by
  one line per file of that size.

  Args:
    lines: iterable of text lines (trailing newlines are allowed).

  Returns:
    A ``(duplicate_count, byte_count)`` tuple. The first file listed in
    each group is treated as the "original" and is not counted; every
    further file adds one to ``duplicate_count`` and the group's size to
    ``byte_count``.

  Raises:
    ValueError: if the first non-blank line of a group does not start
      with an integer size.
  """
  byte_count = 0
  duplicate_count = 0
  current_file_size = 0
  expecting_header = True  # True until the size header of a group is read
  original_seen = False    # True once the group's first file was skipped

  for line in lines:
    if line.strip() == '':
      # a blank line ends the current group; the next line is a header
      expecting_header = True
      continue

    if expecting_header:
      # The size is the first whitespace-separated field. Splitting (as
      # opposed to slicing up to the first space) still works when the
      # line has no space at all or starts with whitespace.
      size_field = line.split(None, 1)[0]
      try:
        current_file_size = int(size_field)
      except ValueError as err:
        raise ValueError(
          "expected a '<size> bytes each:' header, got: %r" % line
        ) from err
      expecting_header = False
      original_seen = False
    elif not original_seen:
      # we don't want to count the "original", so the first file of each
      # group is skipped; this also keeps the totals from going negative
      # when a header is not followed by any file line
      original_seen = True
    else:
      byte_count += current_file_size
      duplicate_count += 1

  return duplicate_count, byte_count


def main():
  """Read INPUT and print the duplicate count and the wasted space."""
  # File names are never inspected, so undecodable bytes in them must not
  # abort the summary: surrogateescape lets every line decode.
  with open(INPUT, errors="surrogateescape") as f:
    duplicate_count, byte_count = summarise_fdupes(f)

  print("Count duplicate files: ", duplicate_count)
  print("Wasted space (MB):     ", byte_count / 1000000)


if __name__ == "__main__":
  main()