In [2]:
import os, shutil, magic

STEP 1: Rename all files

STEP 2: Once renamed, sort into respective folders

(A) First, create a dictionary that counts every file type (extension or MIME type) found in the folder tree

In [ ]:
import os, shutil, magic

# =============================
# USING DIY EXTENSION-GETTING
# =============================

# directory_in_str = '/Volumes/EHD_5/____play_all/'

# count = 0
# diy_ext = {}
# for subdir, dirs, files in os.walk(directory_in_str):
#     for file in files:
#         count += 1
#         ext = file.split('.')[-1]
#         if ext in diy_ext:
#             diy_ext[ext] += 1
#         else:
#             diy_ext[ext] = 1

# print(count)         

# =============================
# USING FILE MAGIC
# =============================

# Walk the whole tree and tally how many files libmagic reports for each MIME type.
directory_in_str = '/Volumes/EHD_5/____play_all/'

count = 0       # every file visited, including ones libmagic could not read
diy_ext = {}    # MIME type -> number of files reported with that type
for subdir, dirs, files in os.walk(directory_in_str):
    for file in files:
        filepath = os.path.join(subdir, file)
        count += 1
        try:
            ext = magic.from_file(filepath, mime=True)
        except Exception as err:
            # Best effort: report the unreadable file and keep scanning.
            # `Exception` (not a bare except) lets Ctrl-C / kernel interrupt
            # still stop a long-running scan.
            print('no magic', file, err)
            continue
        diy_ext[ext] = diy_ext.get(ext, 0) + 1

print('dict length', len(diy_ext))
print(count)

(B) Import already "labeled" filetypes

In [ ]:
import pandas as pd

# -----------------------------------------------------------
# Labels keyed by the DIY (split-on-dot) extension
# -----------------------------------------------------------
# Column '0' is used as the key and column '1' as the value.
ext = pd.read_csv('EHD_labeled_extensions_and_filetypes.csv')
ext_dict = {key: label for key, label in zip(ext['0'], ext['1'])}

# -----------------------------------------------------------
# Labels keyed by the file-magic result
# -----------------------------------------------------------
magic_ext = pd.read_csv('EHD_magic_labeled_extensions_and_filetypes.csv')
magic_dict = {key: label for key, label in zip(magic_ext['0'], magic_ext['1'])}

(C) Compare that extension dictionary with the one we have already labeled

In [ ]:
# Keep only the scanned types that have no entry in the labeled table yet,
# preserving the order in which the scan encountered them.
new_dict = {
    file_type: seen_count
    for file_type, seen_count in diy_ext.items()
    if file_type not in magic_dict
}
for file_type, seen_count in new_dict.items():
    print(file_type, seen_count)

(D) Label the filetypes we don't already have, typing a shorthand category for each one

In [ ]:
# Show each unlabeled type with its file count and record whatever
# shorthand category is typed at the prompt.
additional_filetypes = {}
for file_type, seen_count in new_dict.items():
    print(file_type, seen_count)
    additional_filetypes[file_type] = input('parent')

(E) Convert the shorthand labels into the full destination folder names

In [ ]:
# Shorthand typed at the prompt -> destination folder name.
SHORTHAND_TO_FOLDER = {
    'doc': '_____DOCUMENTS',
    'docs': '_____DOCUMENTS',
    'dof': '_____DOCUMENTS',        # presumably a typo of 'doc' that was typed once
    'dev': '_____DEVELOPER',
    'no': '_____OTHER_UNKNOWN',
    'unknown': '_____OTHER_UNKNOWN',
    'app': '_____OTHER_UNKNOWN',
    'zip': '_____ZIP',
    'image': '_____PHOTOS',
    'photos': '_____PHOTOS',
    'photo': '_____PHOTOS',
    'media': '_____MEDIA',
    'photoshop': '_____PHOTOSHOP',
}
KNOWN_FOLDERS = set(SHORTHAND_TO_FOLDER.values())

# Build a fresh dict instead of aliasing `additional_filetypes`, so the raw
# answers stay available and this cell can be re-run safely.
new_dict = {}
unrecognized = {}
for file_type, label in additional_filetypes.items():
    # Tolerate stray spaces / capitals in what was typed at the prompt.
    shorthand = label.strip().lower()
    if shorthand in SHORTHAND_TO_FOLDER:
        new_dict[file_type] = SHORTHAND_TO_FOLDER[shorthand]
    else:
        # Unknown labels are kept unchanged (as before), but are reported
        # because they would otherwise be used verbatim as a folder name.
        new_dict[file_type] = label
        if label not in KNOWN_FOLDERS:
            unrecognized[file_type] = label

if unrecognized:
    print('unrecognized shorthand, left unchanged:', unrecognized)

new_dict

(F) Merge the original labeled dictionary with the newly labeled dictionary

In [ ]:
def merge_two_dicts(x, y):
    """Return a new mapping holding the entries of ``x`` overlaid with ``y``.

    Neither argument is modified. When a key appears in both, the value
    from ``y`` is the one kept in the result.
    """
    merged = x.copy()
    # update() works in place and returns None, so it cannot be chained
    # onto the copy above.
    merged.update(y)
    return merged

# Combine the previously labeled table with the newly labeled entries.
# If a key is present in both, the value from new_dict is the one kept.
mega_dict = merge_two_dicts(magic_dict, new_dict)
# Number of distinct keys in the merged table.
len(mega_dict)
In [5]:
# JUST FOR play_all
import pandas as pd

# Read the v2 label table and build the lookup straight from it:
# column '0' supplies the keys and column '1' the values.
magic_ext = pd.read_csv('EHD_magic_labeled_extensions_and_filetypes_v2.csv')
mega_dict = {key: folder for key, folder in zip(magic_ext['0'], magic_ext['1'])}
mega_dict
/Users/kendraryan/.pyenv/versions/3.7.3/lib/python3.7/site-packages/pandas/compat/__init__.py:117: UserWarning: Could not import the lzma module. Your installed Python is incomplete. Attempting to use lzma compression will result in a RuntimeError.
  warnings.warn(msg)
Out[5]:
{'application/CDFV2': '_____OTHER_UNKNOWN',
 'application/csv': '_____DOCUMENTS',
 'application/epub+zip': '_____ZIP',
 'application/gzip': '_____ZIP',
 'application/java-archive': '_____OTHER_UNKNOWN',
 'application/javascript': '_____DEVELOPER',
 'application/json': '_____DEVELOPER',
 'application/mac-binhex40': '_____OTHER_UNKNOWN',
 'application/msword': '_____DOCUMENTS',
 'application/mxf': '_____OTHER_UNKNOWN',
 'application/octet-stream': '_____OTHER_UNKNOWN',
 'application/pdf': '_____DOCUMENTS',
 'application/pgp-signature': '_____OTHER_UNKNOWN',
 'application/vnd.iccprofile': '_____OTHER_UNKNOWN',
 'application/vnd.ms-excel': '_____DOCUMENTS',
 'application/vnd.ms-fontobject': '_____DOCUMENTS',
 'application/vnd.ms-opentype': '_____DOCUMENTS',
 'application/vnd.ms-powerpoint': '_____DOCUMENTS',
 'application/vnd.openxmlformats-officedocument.presentationml.presentation': '_____DOCUMENTS',
 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '_____DOCUMENTS',
 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '_____DOCUMENTS',
 'application/vnd.sketchup.skp': '_____PHOTOSHOP',
 'application/x-7z-compressed': '_____ZIP',
 'application/x-archive': '_____OTHER_UNKNOWN',
 'application/x-dmp': '_____OTHER_UNKNOWN',
 'application/x-dosexec': '_____OTHER_UNKNOWN',
 'application/x-executable': '_____OTHER_UNKNOWN',
 'application/x-font-sfn': '_____OTHER_UNKNOWN',
 'application/x-fpt': '_____OTHER_UNKNOWN',
 'application/x-gettext-translation': '_____OTHER_UNKNOWN',
 'application/x-git': '_____ZIP',
 'application/x-mach-binary': '_____OTHER_UNKNOWN',
 'application/x-rpm': '_____OTHER_UNKNOWN',
 'application/x-sharedlib': '_____OTHER_UNKNOWN',
 'application/x-shockwave-flash': '_____OTHER_UNKNOWN',
 'application/x-sqlite3': '_____DEVELOPER',
 'application/x-tar': '_____ZIP',
 'application/x-wine-extension-ini': '_____OTHER_UNKNOWN',
 'application/x-xar': '_____OTHER_UNKNOWN',
 'application/x-xz': '_____OTHER_UNKNOWN',
 'application/zip': '_____ZIP',
 'application/zlib': '_____ZIP',
 'audio/amr': '_____PHOTOS',
 'audio/mpeg': '_____PHOTOS',
 'audio/x-aiff': '_____MEDIA',
 'audio/x-hx-aac-adts': '_____OTHER_UNKNOWN',
 'audio/x-m4a': '_____MEDIA',
 'audio/x-mp4a-latm': '_____MEDIA',
 'audio/x-wav': '_____MEDIA',
 'font/sfnt': '_____DOCUMENTS',
 'font/ttf': '_____PHOTOSHOP',
 'image/bmp': '_____PHOTOS',
 'image/gif': '_____PHOTOS',
 'image/heic': '_____PHOTOS',
 'image/jpeg': '_____PHOTOS',
 'image/png': '_____PHOTOS',
 'image/svg+xml': '_____PHOTOS',
 'image/tiff': '_____PHOTOS',
 'image/vnd.adobe.photoshop': '_____PHOTOSHOP',
 'image/vnd.dwg': '_____PHOTOS',
 'image/vnd.microsoft.icon': '_____OTHER_UNKNOWN',
 'image/webp': '_____PHOTOS',
 'image/x-canon-cr2': '_____PHOTOS',
 'image/x-eps': '_____PHOTOS',
 'image/x-exr': '_____PHOTOS',
 'image/x-icns': '_____OTHER_UNKNOWN',
 'image/x-mvg': '_____PHOTOS',
 'image/x-portable-pixmap': '_____PHOTOS',
 'image/x-tga': '_____PHOTOS',
 'inode/x-empty': '_____OTHER_UNKNOWN',
 'message/rfc822': '_____OTHER_UNKNOWN',
 'text/calendar': '_____MEDIA',
 'text/html': '_____DEVELOPER',
 'text/plain': '_____DOCUMENTS',
 'text/rtf': '_____DOCUMENTS',
 'text/troff': '_____DOCUMENTS',
 'text/vcard': '_____DOCUMENTS',
 'text/x-Algol68': '_____OTHER_UNKNOWN',
 'text/x-asm': '_____OTHER_UNKNOWN',
 'text/x-bytecode.python': '_____DEVELOPER',
 'text/x-c': '_____OTHER_UNKNOWN',
 'text/x-c++': '_____OTHER_UNKNOWN',
 'text/x-clojure': '_____OTHER_UNKNOWN',
 'text/x-diff': '_____OTHER_UNKNOWN',
 'text/x-java': '_____DEVELOPER',
 'text/x-lisp': '_____DEVELOPER',
 'text/x-makefile': '_____OTHER_UNKNOWN',
 'text/x-msdos-batch': '_____OTHER_UNKNOWN',
 'text/x-objective-c': '_____OTHER_UNKNOWN',
 'text/x-pascal': '_____OTHER_UNKNOWN',
 'text/x-perl': '_____OTHER_UNKNOWN',
 'text/x-php': '_____DEVELOPER',
 'text/x-po': '_____OTHER_UNKNOWN',
 'text/x-ruby': '_____DEVELOPER',
 'text/x-script.python': '_____DEVELOPER',
 'text/x-shellscript': '_____DEVELOPER',
 'text/x-tex': '_____OTHER_UNKNOWN',
 'text/xml': '_____DOCUMENTS',
 'video/3gpp': '_____OTHER_UNKNOWN',
 'video/mp4': '_____PHOTOS',
 'video/mpeg': '_____PHOTOS',
 'video/quicktime': '_____PHOTOS',
 'video/x-m4v': '_____PHOTOS',
 'video/x-msvideo': '_____PHOTOS'}
In [1]:
# Move every file under `directory_in_str` into the folder that `mega_dict`
# assigns to its MIME type, directly beneath `destination`.
directory_in_str = '/Volumes/EHD_5/____play_all/'
destination = '/Volumes/EHD_5/'

for subdir, dirs, files in os.walk(directory_in_str):
    for file in files:
        filepath = os.path.join(subdir, file)
        try:
            file_extension = magic.from_file(filepath, mime=True)
            if file_extension not in mega_dict:
                # Unlabeled type: report it and leave the file where it is.
                print(file_extension)
                continue

            parent = mega_dict[file_extension]
            target_dir = os.path.join(destination, parent)
            # The category folder may not exist yet on a fresh drive.
            os.makedirs(target_dir, exist_ok=True)

            # Files from different subfolders often share a name; moving onto
            # an existing path would overwrite it, so pick a free name
            # (name_1.ext, name_2.ext, ...) instead.
            stem, suffix = os.path.splitext(file)
            newpath = os.path.join(target_dir, file)
            duplicate_index = 0
            while os.path.lexists(newpath):
                duplicate_index += 1
                newpath = os.path.join(
                    target_dir, '{}_{}{}'.format(stem, duplicate_index, suffix)
                )

            shutil.move(filepath, newpath)
        except Exception as err:
            # Best effort: report the file and the reason, then keep going.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
            print('-------------------', file, err)
In [9]:
# Move every file under `directory_in_str` into the folder that `mega_dict`
# assigns to its MIME type, directly beneath `destination`.
directory_in_str = '/Volumes/EHD_5/____play_all/'
destination = '/Volumes/EHD_5/'

for subdir, dirs, files in os.walk(directory_in_str):
    for file in files:
        filepath = os.path.join(subdir, file)
        try:
            file_extension = magic.from_file(filepath, mime=True)
            if file_extension not in mega_dict:
                # Unlabeled type: report it and leave the file where it is.
                print(file_extension)
                continue

            parent = mega_dict[file_extension]
            target_dir = os.path.join(destination, parent)
            # The category folder may not exist yet on a fresh drive.
            os.makedirs(target_dir, exist_ok=True)

            # Files from different subfolders often share a name; moving onto
            # an existing path would overwrite it, so pick a free name
            # (name_1.ext, name_2.ext, ...) instead.
            stem, suffix = os.path.splitext(file)
            newpath = os.path.join(target_dir, file)
            duplicate_index = 0
            while os.path.lexists(newpath):
                duplicate_index += 1
                newpath = os.path.join(
                    target_dir, '{}_{}{}'.format(stem, duplicate_index, suffix)
                )

            shutil.move(filepath, newpath)
        except Exception as err:
            # Best effort: report the file and the reason, then keep going.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
            print('-------------------', file, err)
In [ ]: