Change the hashing method to xxHash; update progress reporting by file size where it makes sense.

Kameron Kenny 2024-08-20 22:13:20 -04:00
parent dc9458a890
commit dd35353068
No known key found for this signature in database
GPG Key ID: E5006629839D2276
1 changed file with 134 additions and 19 deletions
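
For context, the core of this change is a single chunked read that feeds both the hasher and a byte-accurate progress bar. A minimal standalone sketch of that pattern (the function name and path are illustrative, not from the commit; assumes the xxhash and tqdm packages are installed):

import os
import xxhash
from tqdm import tqdm

def xxh64_with_progress(path, chunk_size=4096):
    """ Hash a file with xxh64 while advancing a progress bar by bytes read. """
    size = os.path.getsize(path)
    hasher = xxhash.xxh64()
    with open(path, 'rb') as f, tqdm(total=size, unit='B', unit_scale=True,
                                     desc=f'Hashing {os.path.basename(path)}') as pbar:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hasher.update(chunk)
            pbar.update(len(chunk))  # progress tracks bytes read, not chunk count
    return hasher.hexdigest()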


@@ -20,6 +20,7 @@ from pprint import pprint
 import argparse
 import shutil
 import hashlib
+import xxhash
 from datetime import datetime
 from tqdm import tqdm
 import yaml
@@ -36,6 +37,7 @@ try:
 except FileNotFoundError:
     print("Configuration file not found: ", CONFIG_FILE)
     print("Copy config.yaml.EXAMPLE to ", CONFIG_FILE, " and update accordingly.")
+    sys.exit()
 
 parser = argparse.ArgumentParser()
 parser.add_argument("-e", "--event", help = "Event Name")
@@ -77,9 +79,18 @@ if args.destination:
 def dump_yaml(dictionary, file):
     """ dump a dictionary to a yaml file """
+    one_million = 1000**2
     with open(file, 'w') as f:
-        yaml.dump(dictionary, f)
+        yaml.dump(
+            dictionary, f,
+            default_flow_style=False,
+            width=one_million)
+
+def is_file(file):
+    """ Determine if the object is a file. """
+    return bool(os.path.isfile(file))
 
+'''
 def md5_hash(file):
     """ calculates and returns md5 hash """
     if config['verify_checksum']:
@@ -90,10 +101,33 @@ def md5_hash(file):
     else:
         md5 = 'no_verify'
     return md5
+'''
+
+def xx_hash(file):
+    """ calculates and returns file hash based on xxHash """
+    if config['verify_checksum']:
+        size = os.path.getsize(file)
+        hasher = xxhash.xxh64()
+        with open(file, 'rb') as f:
+            with tqdm(total=size,
+                      unit='B',
+                      unit_scale=True,
+                      desc=f'Getting hash for {os.path.basename(file)}') as pbar:
+                for chunk in iter(lambda: f.read(4096), b""):
+                    hasher.update(chunk)
+                    pbar.update(len(chunk))
+        file_hash = hasher.hexdigest()
+    else:
+        file_hash = 'no_verify'
+    return file_hash
 
 def cmp_files(file_1,file_2):
     """ Use file hashes to compare files """
-    return md5_hash(file_1) == md5_hash(file_2)
+    hash1 = xx_hash(file_1)
+    hash2 = xx_hash(file_2)
+    print(f'\n{hash1}')
+    print(f'\n{hash2}')
+    return hash1 == hash2
 
 def get_capture_date(path, f_type):
     """ get capture date from meta """
@@ -131,14 +165,30 @@ def get_capture_date(path, f_type):
         stamp = datetime.strptime(
             str('1900:01:01 00:00:00'), '%Y:%m:%d %H:%M:%S')
     elif f_type == 'video':
-        stamp = datetime.strptime(
-            ffmpeg.probe(path)['format']['tags']['creation_time'],
-            '%Y-%m-%dT%H:%M:%S.%f%z')
+        try:
+            stamp = datetime.strptime(
+                ffmpeg.probe(path)['format']['tags']['creation_time'],
+                '%Y-%m-%dT%H:%M:%S.%f%z')
+        except:
+            print(f"\n{path} had an error. Please inspect the file and try again.")
+            sys.exit()
     elif f_type == 'audio':
-        stamp = datetime.strptime(
-            ffmpeg.probe(path)['format']['tags']['date'], '%Y-%m-%d')
-    else:
-        stamp = datetime.fromtimestamp(os.path.getctime(path))
+        try:
+            stamp = datetime.strptime(ffmpeg.probe(
+                path)['format']['tags']['date'], '%Y-%m-%d')
+        except KeyError as ke:
+            print(f'\nError: {ke} for {path}. Trying getctime...')
+            try:
+                stamp = datetime.fromtimestamp(os.path.getctime(path))
+            except:
+                print(f'\nCould not get timestamp for {path}. Giving up.')
+                sys.exit()
+    else:
+        try:
+            stamp = datetime.fromtimestamp(os.path.getctime(path))
+        except:
+            print(f'\nCould not get timestamp for {path}. Giving up.')
+            sys.exit()
 
     year = stamp.strftime("%Y")
     month = stamp.strftime("%m")
@@ -184,26 +234,49 @@ def create_folder(file):
     elif is_dir(file) is False:
         pass # this needs to turn into bailing out as there is a collision.
 
+def copy_with_progress(s,d,f):
+    """ Copy a file with the progress bar """
+    size = os.path.getsize(s)
+    with open(s, 'rb') as fs:
+        with open(d, 'wb') as fd:
+            with tqdm(total=size, unit='B', unit_scale=True, desc=f'Copying {f}') as pbar:
+                while True:
+                    chunk = fs.read(4096)
+                    if not chunk:
+                        break
+                    fd.write(chunk)
+                    pbar.update(len(chunk))
+
 def copy_from_source(source_path,dest_path,file_name):
     """ Copy file from source to destination """
     file_exists = path_exists(os.path.join(dest_path,file_name))
     if file_exists is True:
+        print(f'\nFound {file_name} at destination, checking if they match.')
         check_match = cmp_files(os.path.join(source_path,file_name),
                                 os.path.join(dest_path, file_name))
         if check_match is False:
-            print(f'Found duplicate for {source_path}, renaming destination with md5 appended.')
+            print(f'\nFound duplicate for {source_path}/{file_name}, \
+                  renaming destination with hash appended.')
             base, extension = os.path.splitext(file_name)
-            md5 = md5_hash(os.path.join(dest_path, file_name))
-            file_name_hash = base + '_' + md5 + extension
+            #md5 = md5_hash(os.path.join(dest_path, file_name))
+            f_xxhash = xx_hash(os.path.join(dest_path, file_name))
+            #file_name_hash = base + '_' + md5 + extension
+            file_name_hash = base + '_' + f_xxhash + extension
             os.rename(os.path.join(dest_path, file_name),
                       os.path.join(dest_path, file_name_hash))
         else:
+            print(f'\n{file_name} hashes match')
             return
     create_folder(dest_path)
-    shutil.copy(os.path.join(source_path,file_name), dest_path)
+    #shutil.copy(os.path.join(source_path,file_name), dest_path)
+    copy_with_progress(os.path.join(source_path,file_name),
+                       os.path.join(dest_path,file_name),
+                       file_name)
+    os.system('clear')
 
 def process_file(path, f_type, f_name, ext):
     """ gather information and add to dictionary """
@@ -269,13 +342,18 @@ def process_file(path, f_type, f_name, ext):
 def find_files(directory):
     """ find files to build a dictionary out of """
+    os.system('clear')
     for folder, subfolders, filename in os.walk(directory):
         for f_type in config['file_types']:
             for ext in config['file_types'][f_type]:
-                for file in tqdm(filename, desc = 'Finding ' + ext + ' Files', ncols = 100):
+                for file in tqdm(filename,
+                                 desc = 'Finding ' + ext + ' Files in ' + folder):
                     if file.lower().endswith(ext):
-                        # print(file)
-                        process_file(folder, f_type, file, ext)
+                        current_file = os.path.join(folder,file)
+                        if is_file(current_file):
+                            process_file(folder, f_type, file, ext)
+                        else:
+                            print(f"Skipping {current_file} as it does not look like a real file.")
 
 def validate_config_dir_access():
     """ Validate we can operate in the defined directories """
@@ -299,7 +377,8 @@ def validate_config_dir_access():
 def copy_files():
     """ Copy Files. """
-    for file in tqdm(files, desc = "Copying Files:", ncols = 100):
+    os.system('clear')
+    for file in tqdm(files, desc = "Copying Files:"):
         create_folder(files[file]['folders']['destination'])
 
         copy_from_source(files[file]['folders']['source_path'],
@@ -314,6 +393,7 @@ def copy_files():
                          files[file]['folders']['destination_original'],
                          files[file]['name'])
 
+'''
 def gen_hashes():
     """ Generate Hashes """
     for file in tqdm(files, desc = "Generating MD5 Hashes:", ncols = 100):
@@ -322,7 +402,20 @@ def gen_hashes():
         for folder in files[file]['folders']:
             k = os.path.join(files[file]['folders'][folder], files[file]['name'])
             files[file]['md5_checksums'][k] = md5_hash(k)
+'''
+
+def gen_xxhashes():
+    """ Generate xxHashes """
+    os.system('clear')
+    for file in tqdm(files, desc = "Generating xx Hashes:"):
+        #print(files[file])
+        files[file]['xx_checksums'] = {}
+        for folder in files[file]['folders']:
+            k = os.path.join(files[file]['folders'][folder], files[file]['name'])
+            files[file]['xx_checksums'][k] = xx_hash(k)
+            print(f"{k}: {files[file]['xx_checksums'][k]}")
 
+'''
 def validate_checksums():
     """ Validate Checksums """
     for file in tqdm(files, desc = "Verifying Checksums:", ncols = 100):
@@ -341,11 +434,33 @@ def validate_checksums():
                 print('\n File Meta:\n')
                 pprint(files[file])
         i = i + 1
+'''
+
+def validate_xx_checksums():
+    """ Validate Checksums """
+    os.system('clear')
+    for file in tqdm(files, desc = "Verifying Checksums:"):
+        i = 0
+        c = {}
+        for checksum in files[file]['xx_checksums']:
+            c[i] = files[file]['xx_checksums'][checksum]
+            if i > 0:
+                p = i - 1
+                if c[i] == c[p]:
+                    files[file]['source_cleanable'] = True
+                else:
+                    files[file]['source_cleanable'] = False
+                    print(f'FATAL: Checksum validation failed for: \
+                          {files[file]["name"]} \n{c[i]}\n is not equal to \n{c[p]}\n')
+                    print('\n File Meta:\n')
+                    pprint(files[file])
+            i = i + 1
 
 def cleanup_sd():
     """ If we should clean up the SD, nuke the copied files. """
     if config['cleanup_sd'] is True:
-        for file in tqdm(files, desc = "Cleaning Up SD:", ncols = 100):
+        os.system('clear')
+        for file in tqdm(files, desc = "Cleaning Up SD:"):
             if files[file]['source_cleanable'] is True:
                 os.remove(os.path.join(files[file]['folders']['source_path'],files[file]['name']))
@@ -353,8 +468,8 @@ GO = validate_config_dir_access()
 if GO is True:
     find_files(config['folders']['source']['base'])
     copy_files()
-    gen_hashes()
-    validate_checksums()
+    gen_xxhashes()
+    validate_xx_checksums()
     cleanup_sd()
 else:
     print("There was a problem accessing one or more directories defined in the configuration.")
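
xxHash is a non-cryptographic hash designed for throughput, which is presumably what motivates dropping MD5 here. A rough, hypothetical way to compare the two on a large media file (the file name below is a placeholder, not from the commit):

import hashlib
import time
import xxhash

def time_hash(path, make_hasher, chunk_size=1 << 20):
    """ Hash a file in 1 MiB chunks and report the elapsed wall time. """
    start = time.perf_counter()
    hasher = make_hasher()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest(), time.perf_counter() - start

for name, factory in (('md5', hashlib.md5), ('xxh64', xxhash.xxh64)):
    digest, secs = time_hash('sample.mov', factory)  # 'sample.mov' is a placeholder
    print(f'{name}: {digest} ({secs:.2f}s)')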