From dd35353068cb6c38a09d1f9a44ca50a7a10245a0 Mon Sep 17 00:00:00 2001
From: Kameron Kenny <1267885+kkenny@users.noreply.github.com>
Date: Tue, 20 Aug 2024 22:13:20 -0400
Subject: [PATCH] Change hashing method, update progress by size where it
 makes sense.

---
 import_media.py | 153 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 134 insertions(+), 19 deletions(-)

diff --git a/import_media.py b/import_media.py
index 2f68435..b5e2759 100755
--- a/import_media.py
+++ b/import_media.py
@@ -20,6 +20,7 @@ from pprint import pprint
 import argparse
 import shutil
 import hashlib
+import xxhash
 from datetime import datetime
 from tqdm import tqdm
 import yaml
@@ -36,6 +37,7 @@ try:
 except FileNotFoundError:
     print("Configuration file not found: ", CONFIG_FILE)
     print("Copy config.yaml.EXAMPLE to ", CONFIG_FILE, " and update accordingly.")
+    sys.exit()
 
 parser = argparse.ArgumentParser()
 parser.add_argument("-e", "--event", help = "Event Name")
@@ -77,9 +79,18 @@ if args.destination:
 
 def dump_yaml(dictionary, file):
     """ dump a dictionary to a yaml file """
+    one_million = 1000**2
     with open(file, 'w') as f:
-        yaml.dump(dictionary, f)
+        yaml.dump(
+            dictionary, f,
+            default_flow_style=False,
+            width=one_million)
 
+def is_file(file):
+    """ Determine if the object is a file. """
+    return os.path.isfile(file)
+
+'''
 def md5_hash(file):
     """ calculates and returns md5 hash """
     if config['verify_checksum']:
@@ -90,10 +101,33 @@ def md5_hash(file):
     else:
         md5 = 'no_verify'
     return md5
+'''
+
+def xx_hash(file):
+    """ calculates and returns file hash based on xxHash """
+    if config['verify_checksum']:
+        size = os.path.getsize(file)
+        hasher = xxhash.xxh64()
+        with open(file, 'rb') as f:
+            with tqdm(total=size,
+                      unit='B',
+                      unit_scale=True,
+                      desc=f'Getting hash for {os.path.basename(file)}') as pbar:
+                for chunk in iter(lambda: f.read(4096), b""):
+                    hasher.update(chunk)
+                    pbar.update(len(chunk))
+        file_hash = hasher.hexdigest()
+    else:
+        file_hash = 'no_verify'
+    return file_hash
 
 def cmp_files(file_1,file_2):
     """ Use file hashes to compare files """
-    return md5_hash(file_1) == md5_hash(file_2)
+    hash1 = xx_hash(file_1)
+    hash2 = xx_hash(file_2)
+    print(f'\n{hash1}')
+    print(f'\n{hash2}')
+    return hash1 == hash2
 
 def get_capture_date(path, f_type):
     """ get capture date from meta """
@@ -131,14 +165,30 @@
         stamp = datetime.strptime(
             str('1900:01:01 00:00:00'), '%Y:%m:%d %H:%M:%S')
     elif f_type == 'video':
-        stamp = datetime.strptime(
-            ffmpeg.probe(path)['format']['tags']['creation_time'],
-            '%Y-%m-%dT%H:%M:%S.%f%z')
+        try:
+            stamp = datetime.strptime(
+                ffmpeg.probe(path)['format']['tags']['creation_time'],
+                '%Y-%m-%dT%H:%M:%S.%f%z')
+        except Exception:
+            print(f"\n{path} had an error. Please inspect the file and try again.")
+            sys.exit()
     elif f_type == 'audio':
-        stamp = datetime.strptime(
-            ffmpeg.probe(path)['format']['tags']['date'], '%Y-%m-%d')
+        try:
+            stamp = datetime.strptime(ffmpeg.probe(
+                path)['format']['tags']['date'], '%Y-%m-%d')
+        except KeyError as ke:
+            print(f'\nError: {ke} for {path}. Trying getctime...')
+            try:
+                stamp = datetime.fromtimestamp(os.path.getctime(path))
+            except Exception:
+                print(f'\nCould not get timestamp for {path}. Giving up.')
+                sys.exit()
     else:
-        stamp = datetime.fromtimestamp(os.path.getctime(path))
+        try:
+            stamp = datetime.fromtimestamp(os.path.getctime(path))
+        except Exception:
+            print(f'\nCould not get timestamp for {path}. Giving up.')
+            sys.exit()
 
     year = stamp.strftime("%Y")
     month = stamp.strftime("%m")
@@ -184,26 +234,49 @@ def create_folder(file):
     elif is_dir(file) is False:
         pass # this needs to turn into bailing out as there is a collision.
 
+def copy_with_progress(s,d,f):
+    """ Copy a file with the progress bar """
+    size = os.path.getsize(s)
+    with open(s, 'rb') as fs:
+        with open(d, 'wb') as fd:
+            with tqdm(total=size, unit='B', unit_scale=True, desc=f'Copying {f}') as pbar:
+                while True:
+                    chunk = fs.read(4096)
+                    if not chunk:
+                        break
+                    fd.write(chunk)
+                    pbar.update(len(chunk))
+
 def copy_from_source(source_path,dest_path,file_name):
     """ Copy file from source to destination """
     file_exists = path_exists(os.path.join(dest_path,file_name))
     if file_exists is True:
+        print(f'\nFound {file_name} at destination, checking if they match.')
         check_match = cmp_files(os.path.join(source_path,file_name),
                                 os.path.join(dest_path, file_name))
         if check_match is False:
-            print(f'Found duplicate for {source_path}, renaming destination with md5 appended.')
+            print(f'\nFound duplicate for {source_path}/{file_name}, '
+                  'renaming destination with hash appended.')
             base, extension = os.path.splitext(file_name)
-            md5 = md5_hash(os.path.join(dest_path, file_name))
-            file_name_hash = base + '_' + md5 + extension
+            #md5 = md5_hash(os.path.join(dest_path, file_name))
+            f_xxhash = xx_hash(os.path.join(dest_path, file_name))
+            #file_name_hash = base + '_' + md5 + extension
+            file_name_hash = base + '_' + f_xxhash + extension
             os.rename(os.path.join(dest_path, file_name),
                       os.path.join(dest_path, file_name_hash))
         else:
+            print(f'\n{file_name} hashes match')
             return
     create_folder(dest_path)
-    shutil.copy(os.path.join(source_path,file_name), dest_path)
+    #shutil.copy(os.path.join(source_path,file_name), dest_path)
+    copy_with_progress(os.path.join(source_path,file_name),
+                       os.path.join(dest_path,file_name),
+                       file_name)
+
+    os.system('clear')
 
 def process_file(path, f_type, f_name, ext):
     """ gather information and add to dictionary """
@@ -269,13 +342,18 @@ def process_file(path, f_type, f_name, ext):
 
 def find_files(directory):
     """ find files to build a dictionary out of """
+    os.system('clear')
     for folder, subfolders, filename in os.walk(directory):
         for f_type in config['file_types']:
             for ext in config['file_types'][f_type]:
-                for file in tqdm(filename, desc = 'Finding ' + ext + ' Files', ncols = 100):
+                for file in tqdm(filename,
+                                 desc = 'Finding ' + ext + ' Files in ' + folder):
                     if file.lower().endswith(ext):
-                        # print(file)
-                        process_file(folder, f_type, file, ext)
+                        current_file = os.path.join(folder,file)
+                        if is_file(current_file):
+                            process_file(folder, f_type, file, ext)
+                        else:
+                            print(f"Skipping {current_file} as it does not look like a real file.")
 
 def validate_config_dir_access():
     """ Validate we can operate in the defined directories """
@@ -299,7 +377,8 @@
 
 def copy_files():
     """ Copy Files. """
""" - for file in tqdm(files, desc = "Copying Files:", ncols = 100): + os.system('clear') + for file in tqdm(files, desc = "Copying Files:"): create_folder(files[file]['folders']['destination']) copy_from_source(files[file]['folders']['source_path'], @@ -314,6 +393,7 @@ def copy_files(): files[file]['folders']['destination_original'], files[file]['name']) +''' def gen_hashes(): """ Generate Hashes """ for file in tqdm(files, desc = "Generating MD5 Hashes:", ncols = 100): @@ -322,7 +402,20 @@ def gen_hashes(): for folder in files[file]['folders']: k = os.path.join(files[file]['folders'][folder], files[file]['name']) files[file]['md5_checksums'][k] = md5_hash(k) +''' +def gen_xxhashes(): + """ Generate xxHashes """ + os.system('clear') + for file in tqdm(files, desc = "Generating xx Hashes:"): + #print(files[file]) + files[file]['xx_checksums'] = {} + for folder in files[file]['folders']: + k = os.path.join(files[file]['folders'][folder], files[file]['name']) + files[file]['xx_checksums'][k] = xx_hash(k) + print(f"{k}: {files[file]['xx_checksums'][k]}") + +''' def validate_checksums(): """ Validate Checksums """ for file in tqdm(files, desc = "Verifying Checksums:", ncols = 100): @@ -341,11 +434,33 @@ def validate_checksums(): print('\n File Meta:\n') pprint(files[file]) i = i + 1 +''' + +def validate_xx_checksums(): + """ Validate Checksums """ + os.system('clear') + for file in tqdm(files, desc = "Verifying Checksums:"): + i = 0 + c = {} + for checksum in files[file]['xx_checksums']: + c[i] = files[file]['xx_checksums'][checksum] + if i > 0: + p = i - 1 + if c[i] == c[p]: + files[file]['source_cleanable'] = True + else: + files[file]['source_cleanable'] = False + print(f'FATAL: Checksum validation failed for: \ + {files[file]["name"]} \n{c[i]}\n is not equal to \n{c[p]}\n') + print('\n File Meta:\n') + pprint(files[file]) + i = i + 1 def cleanup_sd(): """ If we should clean up the SD, nuke the copied files. """ if config['cleanup_sd'] is True: - for file in tqdm(files, desc = "Cleaning Up SD:", ncols = 100): + os.system('clear') + for file in tqdm(files, desc = "Cleaning Up SD:"): if files[file]['source_cleanable'] is True: os.remove(os.path.join(files[file]['folders']['source_path'],files[file]['name'])) @@ -353,8 +468,8 @@ GO = validate_config_dir_access() if GO is True: find_files(config['folders']['source']['base']) copy_files() - gen_hashes() - validate_checksums() + gen_xxhashes() + validate_xx_checksums() cleanup_sd() else: print("There was a problem accessing one or more directories defined in the configuration.")