Change the hashing method to xxHash; update progress reporting by file size where it makes sense.

Kameron Kenny 2024-08-20 22:13:20 -04:00
parent dc9458a890
commit dd35353068
No known key found for this signature in database
GPG Key ID: E5006629839D2276
1 changed file with 134 additions and 19 deletions
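
For context, the core of this change is a single chunked read that feeds both the hasher and a byte-accurate progress bar. A minimal standalone sketch of that pattern (the function name and path are illustrative, not from the commit; assumes the xxhash and tqdm packages are installed):

import os
import xxhash
from tqdm import tqdm

def xxh64_with_progress(path, chunk_size=4096):
    """ Hash a file with xxh64 while advancing a progress bar by bytes read. """
    size = os.path.getsize(path)
    hasher = xxhash.xxh64()
    with open(path, 'rb') as f, tqdm(total=size, unit='B', unit_scale=True,
                                     desc=f'Hashing {os.path.basename(path)}') as pbar:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hasher.update(chunk)
            pbar.update(len(chunk))  # progress tracks bytes read, not chunk count
    return hasher.hexdigest()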


@@ -20,6 +20,7 @@ from pprint import pprint
 import argparse
 import shutil
 import hashlib
+import xxhash
 from datetime import datetime
 from tqdm import tqdm
 import yaml
@@ -36,6 +37,7 @@ try:
 except FileNotFoundError:
     print("Configuration file not found: ", CONFIG_FILE)
     print("Copy config.yaml.EXAMPLE to ", CONFIG_FILE, " and update accordingly.")
+    sys.exit()
 
 parser = argparse.ArgumentParser()
 parser.add_argument("-e", "--event", help = "Event Name")
@@ -77,9 +79,18 @@ if args.destination:
 def dump_yaml(dictionary, file):
     """ dump a dictionary to a yaml file """
+    one_million = 1000**2
     with open(file, 'w') as f:
-        yaml.dump(dictionary, f)
+        yaml.dump(
+            dictionary, f,
+            default_flow_style=False,
+            width=one_million)
+
+def is_file(file):
+    """ Determine if the object is a file. """
+    return bool(os.path.isfile(file))
 
+'''
 def md5_hash(file):
     """ calculates and returns md5 hash """
     if config['verify_checksum']:
@@ -90,10 +101,33 @@ def md5_hash(file):
     else:
         md5 = 'no_verify'
     return md5
+'''
+
+def xx_hash(file):
+    """ calculates and returns file hash based on xxHash """
+    if config['verify_checksum']:
+        size = os.path.getsize(file)
+        hasher = xxhash.xxh64()
+        with open(file, 'rb') as f:
+            with tqdm(total=size,
+                      unit='B',
+                      unit_scale=True,
+                      desc=f'Getting hash for {os.path.basename(file)}') as pbar:
+                for chunk in iter(lambda: f.read(4096), b""):
+                    hasher.update(chunk)
+                    pbar.update(len(chunk))
+        file_hash = hasher.hexdigest()
+    else:
+        file_hash = 'no_verify'
+    return file_hash
 
 def cmp_files(file_1,file_2):
     """ Use file hashes to compare files """
-    return md5_hash(file_1) == md5_hash(file_2)
+    hash1 = xx_hash(file_1)
+    hash2 = xx_hash(file_2)
+    print(f'\n{hash1}')
+    print(f'\n{hash2}')
+    return hash1 == hash2
 
 def get_capture_date(path, f_type):
     """ get capture date from meta """
@@ -131,14 +165,30 @@ def get_capture_date(path, f_type):
         stamp = datetime.strptime(
             str('1900:01:01 00:00:00'), '%Y:%m:%d %H:%M:%S')
     elif f_type == 'video':
-        stamp = datetime.strptime(
-            ffmpeg.probe(path)['format']['tags']['creation_time'],
-            '%Y-%m-%dT%H:%M:%S.%f%z')
+        try:
+            stamp = datetime.strptime(
+                ffmpeg.probe(path)['format']['tags']['creation_time'],
+                '%Y-%m-%dT%H:%M:%S.%f%z')
+        except:
+            print(f"\n{path} had an error. Please inspect the file and try again.")
+            sys.exit()
     elif f_type == 'audio':
-        stamp = datetime.strptime(
-            ffmpeg.probe(path)['format']['tags']['date'], '%Y-%m-%d')
-    else:
-        stamp = datetime.fromtimestamp(os.path.getctime(path))
+        try:
+            stamp = datetime.strptime(ffmpeg.probe(
+                path)['format']['tags']['date'], '%Y-%m-%d')
+        except KeyError as ke:
+            print(f'\nError: {ke} for {path}. Trying getctime...')
+            try:
+                stamp = datetime.fromtimestamp(os.path.getctime(path))
+            except:
+                print(f'\nCould not get timestamp for {path}. Giving up.')
+                sys.exit()
+    else:
+        try:
+            stamp = datetime.fromtimestamp(os.path.getctime(path))
+        except:
+            print(f'\nCould not get timestamp for {path}. Giving up.')
+            sys.exit()
 
     year = stamp.strftime("%Y")
     month = stamp.strftime("%m")
@@ -184,26 +234,49 @@ def create_folder(file):
     elif is_dir(file) is False:
         pass # this needs to turn into bailing out as there is a collision.
 
+def copy_with_progress(s,d,f):
+    """ Copy a file with the progress bar """
+    size = os.path.getsize(s)
+    with open(s, 'rb') as fs:
+        with open(d, 'wb') as fd:
+            with tqdm(total=size, unit='B', unit_scale=True, desc=f'Copying {f}') as pbar:
+                while True:
+                    chunk = fs.read(4096)
+                    if not chunk:
+                        break
+                    fd.write(chunk)
+                    pbar.update(len(chunk))
+
 def copy_from_source(source_path,dest_path,file_name):
     """ Copy file from source to destination """
     file_exists = path_exists(os.path.join(dest_path,file_name))
     if file_exists is True:
+        print(f'\nFound {file_name} at destination, checking if they match.')
         check_match = cmp_files(os.path.join(source_path,file_name),
                                 os.path.join(dest_path, file_name))
         if check_match is False:
-            print(f'Found duplicate for {source_path}, renaming destination with md5 appended.')
+            print(f'\nFound duplicate for {source_path}/{file_name}, \
+                  renaming destination with hash appended.')
             base, extension = os.path.splitext(file_name)
-            md5 = md5_hash(os.path.join(dest_path, file_name))
-            file_name_hash = base + '_' + md5 + extension
+            #md5 = md5_hash(os.path.join(dest_path, file_name))
+            f_xxhash = xx_hash(os.path.join(dest_path, file_name))
+            #file_name_hash = base + '_' + md5 + extension
+            file_name_hash = base + '_' + f_xxhash + extension
             os.rename(os.path.join(dest_path, file_name),
                       os.path.join(dest_path, file_name_hash))
         else:
+            print(f'\n{file_name} hashes match')
             return
     create_folder(dest_path)
-    shutil.copy(os.path.join(source_path,file_name), dest_path)
+    #shutil.copy(os.path.join(source_path,file_name), dest_path)
+    copy_with_progress(os.path.join(source_path,file_name),
+                       os.path.join(dest_path,file_name),
+                       file_name)
+    os.system('clear')
 
 def process_file(path, f_type, f_name, ext):
     """ gather information and add to dictionary """
@@ -269,13 +342,18 @@ def process_file(path, f_type, f_name, ext):
 def find_files(directory):
     """ find files to build a dictionary out of """
+    os.system('clear')
     for folder, subfolders, filename in os.walk(directory):
         for f_type in config['file_types']:
             for ext in config['file_types'][f_type]:
-                for file in tqdm(filename, desc = 'Finding ' + ext + ' Files', ncols = 100):
+                for file in tqdm(filename,
+                                 desc = 'Finding ' + ext + ' Files in ' + folder):
                     if file.lower().endswith(ext):
-                        # print(file)
-                        process_file(folder, f_type, file, ext)
+                        current_file = os.path.join(folder,file)
+                        if is_file(current_file):
+                            process_file(folder, f_type, file, ext)
+                        else:
+                            print(f"Skipping {current_file} as it does not look like a real file.")
 
 def validate_config_dir_access():
     """ Validate we can operate in the defined directories """
@@ -299,7 +377,8 @@ def validate_config_dir_access():
 def copy_files():
     """ Copy Files. """
-    for file in tqdm(files, desc = "Copying Files:", ncols = 100):
+    os.system('clear')
+    for file in tqdm(files, desc = "Copying Files:"):
         create_folder(files[file]['folders']['destination'])
 
         copy_from_source(files[file]['folders']['source_path'],
@@ -314,6 +393,7 @@ def copy_files():
                          files[file]['folders']['destination_original'],
                          files[file]['name'])
 
+'''
 def gen_hashes():
     """ Generate Hashes """
     for file in tqdm(files, desc = "Generating MD5 Hashes:", ncols = 100):
@@ -322,7 +402,20 @@ def gen_hashes():
         for folder in files[file]['folders']:
             k = os.path.join(files[file]['folders'][folder], files[file]['name'])
             files[file]['md5_checksums'][k] = md5_hash(k)
+'''
+
+def gen_xxhashes():
+    """ Generate xxHashes """
+    os.system('clear')
+    for file in tqdm(files, desc = "Generating xx Hashes:"):
+        #print(files[file])
+        files[file]['xx_checksums'] = {}
+        for folder in files[file]['folders']:
+            k = os.path.join(files[file]['folders'][folder], files[file]['name'])
+            files[file]['xx_checksums'][k] = xx_hash(k)
+            print(f"{k}: {files[file]['xx_checksums'][k]}")
 
+'''
 def validate_checksums():
     """ Validate Checksums """
     for file in tqdm(files, desc = "Verifying Checksums:", ncols = 100):
@@ -341,11 +434,33 @@ def validate_checksums():
                 print('\n File Meta:\n')
                 pprint(files[file])
         i = i + 1
+'''
+
+def validate_xx_checksums():
+    """ Validate Checksums """
+    os.system('clear')
+    for file in tqdm(files, desc = "Verifying Checksums:"):
+        i = 0
+        c = {}
+        for checksum in files[file]['xx_checksums']:
+            c[i] = files[file]['xx_checksums'][checksum]
+            if i > 0:
+                p = i - 1
+                if c[i] == c[p]:
+                    files[file]['source_cleanable'] = True
+                else:
+                    files[file]['source_cleanable'] = False
+                    print(f'FATAL: Checksum validation failed for: \
+                          {files[file]["name"]} \n{c[i]}\n is not equal to \n{c[p]}\n')
+                    print('\n File Meta:\n')
+                    pprint(files[file])
+            i = i + 1
 
 def cleanup_sd():
     """ If we should clean up the SD, nuke the copied files. """
     if config['cleanup_sd'] is True:
-        for file in tqdm(files, desc = "Cleaning Up SD:", ncols = 100):
+        os.system('clear')
+        for file in tqdm(files, desc = "Cleaning Up SD:"):
             if files[file]['source_cleanable'] is True:
                 os.remove(os.path.join(files[file]['folders']['source_path'],files[file]['name']))
@@ -353,8 +468,8 @@ GO = validate_config_dir_access()
 if GO is True:
     find_files(config['folders']['source']['base'])
     copy_files()
-    gen_hashes()
-    validate_checksums()
+    gen_xxhashes()
+    validate_xx_checksums()
     cleanup_sd()
 else:
     print("There was a problem accessing one or more directories defined in the configuration.")
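
xxHash is a non-cryptographic hash designed for throughput, which is presumably what motivates dropping MD5 here. A rough, hypothetical way to compare the two on a large media file (the file name below is a placeholder, not from the commit):

import hashlib
import time
import xxhash

def time_hash(path, make_hasher, chunk_size=1 << 20):
    """ Hash a file in 1 MiB chunks and report the elapsed wall time. """
    start = time.perf_counter()
    hasher = make_hasher()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest(), time.perf_counter() - start

for name, factory in (('md5', hashlib.md5), ('xxh64', xxhash.xxh64)):
    digest, secs = time_hash('sample.mov', factory)  # 'sample.mov' is a placeholder
    print(f'{name}: {digest} ({secs:.2f}s)')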