change hashing method, update progress by size where it makes sense.

parent dc9458a890
commit dd35353068

import_media.py | 153
@@ -20,6 +20,7 @@ from pprint import pprint
 import argparse
 import shutil
 import hashlib
+import xxhash
 from datetime import datetime
 from tqdm import tqdm
 import yaml
@@ -36,6 +37,7 @@ try:
 except FileNotFoundError:
     print("Configuration file not found: ", CONFIG_FILE)
     print("Copy config.yaml.EXAMPLE to ", CONFIG_FILE, " and update accordingly.")
+    sys.exit()
 
 parser = argparse.ArgumentParser()
 parser.add_argument("-e", "--event", help = "Event Name")
@@ -77,9 +79,18 @@ if args.destination:
 
 def dump_yaml(dictionary, file):
     """ dump a dictionary to a yaml file """
+    one_million = 1000**2
     with open(file, 'w') as f:
-        yaml.dump(dictionary, f)
+        yaml.dump(
+            dictionary, f,
+            default_flow_style=False,
+            width=one_million)
 
+def is_file(file):
+    """ Determine if the object is a file. """
+    return bool(os.path.isfile(file))
+
+'''
 def md5_hash(file):
     """ calculates and returns md5 hash """
     if config['verify_checksum']:
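A note on the dump_yaml change above: PyYAML wraps long scalar lines at a default width (around 80 columns), which can fold long file paths across lines; passing an effectively infinite width disables the wrapping. A minimal sketch, assuming PyYAML is installed (the data is made up):

    import yaml

    data = {'path': '/media/very/long/destination/' + 'x' * 200}
    # width=1000**2 effectively means "never wrap"; block style keeps one key per line.
    print(yaml.dump(data, default_flow_style=False, width=1000**2))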
@@ -90,10 +101,33 @@ def md5_hash(file):
     else:
         md5 = 'no_verify'
     return md5
+'''
+
+def xx_hash(file):
+    """ calculates and returns file hash based on xxHash """
+    if config['verify_checksum']:
+        size = os.path.getsize(file)
+        hasher = xxhash.xxh64()
+        with open(file, 'rb') as f:
+            with tqdm(total=size,
+                      unit='B',
+                      unit_scale=True,
+                      desc=f'Getting hash for {os.path.basename(file)}') as pbar:
+                for chunk in iter(lambda: f.read(4096), b""):
+                    hasher.update(chunk)
+                    pbar.update(len(chunk))
+        file_hash = hasher.hexdigest()
+    else:
+        file_hash = 'no_verify'
+    return file_hash
 
 def cmp_files(file_1,file_2):
     """ Use file hashes to compare files """
-    return md5_hash(file_1) == md5_hash(file_2)
+    hash1 = xx_hash(file_1)
+    hash2 = xx_hash(file_2)
+    print(f'\n{hash1}')
+    print(f'\n{hash2}')
+    return hash1 == hash2
 
 def get_capture_date(path, f_type):
     """ get capture date from meta """
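The new xx_hash reads the file in fixed-size chunks, so the whole file never sits in memory and tqdm can report progress in bytes. A standalone sketch of the same pattern, assuming the xxhash and tqdm packages are installed (hash_file and example.bin are illustrative names, not part of the script):

    import os
    import xxhash
    from tqdm import tqdm

    def hash_file(path, chunk_size=4096):
        """ Hash a file in chunks, reporting progress by bytes read. """
        hasher = xxhash.xxh64()
        size = os.path.getsize(path)
        with open(path, 'rb') as f:
            with tqdm(total=size, unit='B', unit_scale=True,
                      desc=os.path.basename(path)) as pbar:
                for chunk in iter(lambda: f.read(chunk_size), b""):
                    hasher.update(chunk)     # feed the running 64-bit hash
                    pbar.update(len(chunk))  # advance the bar by bytes read
        return hasher.hexdigest()

    print(hash_file('example.bin'))  # 16 hex characters, e.g. 'd4c64f01b3dd6e05'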
@@ -131,14 +165,30 @@ def get_capture_date(path, f_type):
             stamp = datetime.strptime(
                 str('1900:01:01 00:00:00'), '%Y:%m:%d %H:%M:%S')
     elif f_type == 'video':
-        stamp = datetime.strptime(
-            ffmpeg.probe(path)['format']['tags']['creation_time'],
-            '%Y-%m-%dT%H:%M:%S.%f%z')
+        try:
+            stamp = datetime.strptime(
+                ffmpeg.probe(path)['format']['tags']['creation_time'],
+                '%Y-%m-%dT%H:%M:%S.%f%z')
+        except:
+            print(f"\n{path} had an error. Please inspect the file and try again.")
+            sys.exit()
     elif f_type == 'audio':
-        stamp = datetime.strptime(
-            ffmpeg.probe(path)['format']['tags']['date'], '%Y-%m-%d')
+        try:
+            stamp = datetime.strptime(ffmpeg.probe(
+                path)['format']['tags']['date'], '%Y-%m-%d')
+        except KeyError as ke:
+            print(f'\nError: {ke} for {path}. Trying getctime...')
+            try:
+                stamp = datetime.fromtimestamp(os.path.getctime(path))
+            except:
+                print(f'\nCould not get timestamp for {path}. Giving up.')
+                sys.exit()
     else:
-        stamp = datetime.fromtimestamp(os.path.getctime(path))
+        try:
+            stamp = datetime.fromtimestamp(os.path.getctime(path))
+        except:
+            print(f'\nCould not get timestamp for {path}. Giving up.')
+            sys.exit()
 
     year = stamp.strftime("%Y")
     month = stamp.strftime("%m")
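On the probe calls now wrapped in try/except: ffmpeg.probe comes from the ffmpeg-python package and shells out to the ffprobe binary, returning container metadata as a nested dict. The tags consulted here are optional in the container, so a missing key (or an unreadable file) has to be handled, which is what the new branches do. A hedged sketch of what the lookup sees (clip.mp4 is a placeholder):

    import ffmpeg  # the ffmpeg-python package; needs ffprobe on PATH

    probe = ffmpeg.probe('clip.mp4')        # raises ffmpeg.Error on a bad file
    tags = probe['format'].get('tags', {})  # 'tags' itself may be absent
    print(tags.get('creation_time'))        # e.g. '2021-03-14T09:26:53.000000Z'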
@@ -184,26 +234,49 @@ def create_folder(file):
     elif is_dir(file) is False:
         pass # this needs to turn into bailing out as there is a collision.
 
+def copy_with_progress(s,d,f):
+    """ Copy a file with the progress bar """
+    size = os.path.getsize(s)
+    with open(s, 'rb') as fs:
+        with open(d, 'wb') as fd:
+            with tqdm(total=size, unit='B', unit_scale=True, desc=f'Copying {f}') as pbar:
+                while True:
+                    chunk = fs.read(4096)
+                    if not chunk:
+                        break
+                    fd.write(chunk)
+                    pbar.update(len(chunk))
+
 def copy_from_source(source_path,dest_path,file_name):
     """ Copy file from source to destination """
 
     file_exists = path_exists(os.path.join(dest_path,file_name))
 
     if file_exists is True:
+        print(f'\nFound {file_name} at destination, checking if they match.')
         check_match = cmp_files(os.path.join(source_path,file_name),
                                 os.path.join(dest_path, file_name))
         if check_match is False:
-            print(f'Found duplicate for {source_path}, renaming destination with md5 appended.')
+            print(f'\nFound duplicate for {source_path}/{file_name}, \
+                renaming destination with hash appended.')
             base, extension = os.path.splitext(file_name)
-            md5 = md5_hash(os.path.join(dest_path, file_name))
-            file_name_hash = base + '_' + md5 + extension
+            #md5 = md5_hash(os.path.join(dest_path, file_name))
+            f_xxhash = xx_hash(os.path.join(dest_path, file_name))
+            #file_name_hash = base + '_' + md5 + extension
+            file_name_hash = base + '_' + f_xxhash + extension
             os.rename(os.path.join(dest_path, file_name),
                       os.path.join(dest_path, file_name_hash))
         else:
+            print(f'\n{file_name} hashes match')
             return
 
     create_folder(dest_path)
-    shutil.copy(os.path.join(source_path,file_name), dest_path)
+    #shutil.copy(os.path.join(source_path,file_name), dest_path)
+    copy_with_progress(os.path.join(source_path,file_name),
+                       os.path.join(dest_path,file_name),
+                       file_name)
+
+    os.system('clear')
 
 def process_file(path, f_type, f_name, ext):
     """ gather information and add to dictionary """
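copy_with_progress reuses the same chunked loop for copying. One trade-off worth noting: a raw read/write loop preserves neither the permission bits that shutil.copy copies nor timestamps. A standalone sketch with an optional copystat call to restore that behavior (paths are placeholders):

    import os
    import shutil
    from tqdm import tqdm

    def copy_with_progress(src, dst, chunk_size=4096):
        """ Copy src to dst in chunks, updating a byte-scaled progress bar. """
        size = os.path.getsize(src)
        with open(src, 'rb') as fs, open(dst, 'wb') as fd:
            with tqdm(total=size, unit='B', unit_scale=True,
                      desc=f'Copying {os.path.basename(src)}') as pbar:
                for chunk in iter(lambda: fs.read(chunk_size), b""):
                    fd.write(chunk)
                    pbar.update(len(chunk))
        shutil.copystat(src, dst)  # optional: preserve mode/mtime, like shutil.copy2

    copy_with_progress('IMG_0001.CR2', '/archive/IMG_0001.CR2')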
@@ -269,13 +342,18 @@ def process_file(path, f_type, f_name, ext):
 
 def find_files(directory):
     """ find files to build a dictionary out of """
+    os.system('clear')
     for folder, subfolders, filename in os.walk(directory):
         for f_type in config['file_types']:
             for ext in config['file_types'][f_type]:
-                for file in tqdm(filename, desc = 'Finding ' + ext + ' Files', ncols = 100):
+                for file in tqdm(filename,
+                                 desc = 'Finding ' + ext + ' Files in ' + folder):
                     if file.lower().endswith(ext):
-                        # print(file)
-                        process_file(folder, f_type, file, ext)
+                        current_file = os.path.join(folder,file)
+                        if is_file(current_file):
+                            process_file(folder, f_type, file, ext)
+                        else:
+                            print(f"Skipping {current_file} as it does not look like a real file.")
 
 def validate_config_dir_access():
     """ Validate we can operate in the defined directories """
|
||||||
|
|
||||||
def copy_files():
|
def copy_files():
|
||||||
""" Copy Files. """
|
""" Copy Files. """
|
||||||
for file in tqdm(files, desc = "Copying Files:", ncols = 100):
|
os.system('clear')
|
||||||
|
for file in tqdm(files, desc = "Copying Files:"):
|
||||||
create_folder(files[file]['folders']['destination'])
|
create_folder(files[file]['folders']['destination'])
|
||||||
|
|
||||||
copy_from_source(files[file]['folders']['source_path'],
|
copy_from_source(files[file]['folders']['source_path'],
|
||||||
|
@@ -314,6 +393,7 @@ def copy_files():
                          files[file]['folders']['destination_original'],
                          files[file]['name'])
 
+'''
 def gen_hashes():
     """ Generate Hashes """
     for file in tqdm(files, desc = "Generating MD5 Hashes:", ncols = 100):
@@ -322,7 +402,20 @@ def gen_hashes():
         for folder in files[file]['folders']:
             k = os.path.join(files[file]['folders'][folder], files[file]['name'])
             files[file]['md5_checksums'][k] = md5_hash(k)
+'''
 
+def gen_xxhashes():
+    """ Generate xxHashes """
+    os.system('clear')
+    for file in tqdm(files, desc = "Generating xx Hashes:"):
+        #print(files[file])
+        files[file]['xx_checksums'] = {}
+        for folder in files[file]['folders']:
+            k = os.path.join(files[file]['folders'][folder], files[file]['name'])
+            files[file]['xx_checksums'][k] = xx_hash(k)
+            print(f"{k}: {files[file]['xx_checksums'][k]}")
+
+'''
 def validate_checksums():
     """ Validate Checksums """
     for file in tqdm(files, desc = "Verifying Checksums:", ncols = 100):
@@ -341,11 +434,33 @@ def validate_checksums():
                     print('\n File Meta:\n')
                     pprint(files[file])
             i = i + 1
+'''
+
+def validate_xx_checksums():
+    """ Validate Checksums """
+    os.system('clear')
+    for file in tqdm(files, desc = "Verifying Checksums:"):
+        i = 0
+        c = {}
+        for checksum in files[file]['xx_checksums']:
+            c[i] = files[file]['xx_checksums'][checksum]
+            if i > 0:
+                p = i - 1
+                if c[i] == c[p]:
+                    files[file]['source_cleanable'] = True
+                else:
+                    files[file]['source_cleanable'] = False
+                    print(f'FATAL: Checksum validation failed for: \
+                        {files[file]["name"]} \n{c[i]}\n is not equal to \n{c[p]}\n')
+                    print('\n File Meta:\n')
+                    pprint(files[file])
+            i = i + 1
 
 def cleanup_sd():
     """ If we should clean up the SD, nuke the copied files. """
     if config['cleanup_sd'] is True:
-        for file in tqdm(files, desc = "Cleaning Up SD:", ncols = 100):
+        os.system('clear')
+        for file in tqdm(files, desc = "Cleaning Up SD:"):
             if files[file]['source_cleanable'] is True:
                 os.remove(os.path.join(files[file]['folders']['source_path'],files[file]['name']))
 
@@ -353,8 +468,8 @@ GO = validate_config_dir_access()
 if GO is True:
     find_files(config['folders']['source']['base'])
     copy_files()
-    gen_hashes()
-    validate_checksums()
+    gen_xxhashes()
+    validate_xx_checksums()
     cleanup_sd()
 else:
     print("There was a problem accessing one or more directories defined in the configuration.")
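The pairwise walk in validate_xx_checksums boils down to requiring that every checksum recorded for a file is identical before the source copy is marked cleanable. The same decision expressed on toy data (all values are made up):

    # Stand-in for one entry of the `files` dict the script builds.
    file_meta = {
        'name': 'IMG_0001.CR2',
        'xx_checksums': {
            '/sdcard/DCIM/IMG_0001.CR2': 'd4c64f01b3dd6e05',
            '/archive/2021/IMG_0001.CR2': 'd4c64f01b3dd6e05',
        },
    }

    checksums = list(file_meta['xx_checksums'].values())
    # Safe to delete the source only if every copy hashed to the same value.
    file_meta['source_cleanable'] = all(c == checksums[0] for c in checksums)
    print(file_meta['source_cleanable'])  # True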