# src/cso/cso_colhub.py  (diff view: +165 -14)
#
# @@ -24,6 +24,9 @@
# 2025-04, Arjo Segers
#   Changed imports for python packaging.
#
# 2025-09, Arjo Segers
#   Added "CSO_ColHubMirror_Cleanup" class.
#
########################################################################
###
# @@ -515,6 +518,154 @@ class CSO_ColHubMirror_Missing(utopya.UtopyaRc):

# endclass CSO_ColHubMirror_Missing


########################################################################
###
### cleanup mirror extension
###
########################################################################


class CSO_ColHubMirror_Cleanup(utopya.UtopyaRc):
    """
    Cleanup extension to mirror:

    * remove files that are (now) also available in mirror

    In the settings, define the listing files of the mirror and the extension;
    optionally replace time templates in the filename by a specific date
    (default: today)::

        <rcbase>.mirror.file        :  /work/inquire/mirror.csv
        !<rcbase>.mirror.filedate    :  2025-01-24

        <rcbase>.mirror2.file       :  /work/inquire/mirror2.csv
        !<rcbase>.mirror2.filedate   :  2025-01-24

    If files in the extension are also present in the main mirror archive,
    then these records are removed from the listing of the extension.
    The corresponding files are renamed to an extension ``.bak``,
    or removed if the following flag is set::

        ! remove duplicates? otherwise rename to '.bak':
        <rcbase>.mirror2.remove_duplicates   :  False

    Optionally define a creation mode for the (parent) directories::

        ! directory creation mode:
        <rcbase>.dmode    :  0o775

    """

    def __init__(self, rcfile, rcbase="", env={}, indent=""):
        """
        Remove records (and files) from the extension listing that are
        also present in the listing of the main archive.

        Arguments:

        * ``rcfile``, ``rcbase``, ``env`` : passed to ``utopya.UtopyaRc.__init__``
        * ``indent`` : prefix for the log messages

        Raises ``FileNotFoundError`` if a duplicate record refers to a file
        that is not present; in that case no file has been changed yet.
        """

        # modules:
        import os
        import datetime

        # tools:
        from . import cso_file

        # info ...
        logging.info(f"{indent}")
        logging.info(f"{indent}** cleanup mirror extension")
        logging.info(f"{indent}")

        # init base object:
        utopya.UtopyaRc.__init__(self, rcfile=rcfile, rcbase=rcbase, env=env)

        # default date used to evaluate time templates:
        now = datetime.datetime.now()

        # table files in main archive:
        listfile1 = self.GetSetting("mirror.file")
        # evaluate time? the documented key is "mirror.filedate";
        # the formerly used key "mirror1.filedate" is still accepted as fallback:
        filedate1 = self.GetSetting(
            "mirror.filedate",
            totype="datetime",
            default=self.GetSetting("mirror1.filedate", totype="datetime", default=now),
        )
        listfile1 = filedate1.strftime(listfile1)
        # read:
        listing1 = cso_file.CSO_Listing(listfile1)

        # table files in extra archive:
        listfile2 = self.GetSetting("mirror2.file")
        # evaluate time?
        filedate2 = self.GetSetting("mirror2.filedate", totype="datetime", default=now)
        listfile2 = filedate2.strftime(listfile2)
        # read:
        listing2 = cso_file.CSO_Listing(listfile2)

        # remove duplicate files? if not, rename:
        remove_duplicates = self.GetSetting("mirror2.remove_duplicates", totype="bool")

        # info ...
        logging.info(f"{indent}check on duplicate records in extension ...")

        # First collect all duplicates and check that their files are present;
        # nothing is changed on disk yet, so that a missing file does not leave
        # the listing out of sync with files that were already removed/renamed.
        duplicates = []
        # loop over extra files:
        for fname2, row2 in listing2.df.iterrows():
            # only records that are also in main archive:
            if fname2 not in listing1.df.index:
                continue
            # endif
            # info ...
            logging.info(f"{indent}file '{fname2}' already in main archive ...")
            # actual file:
            href = row2["href"]
            # check ..
            if not os.path.isfile(href):
                msg = f"file not found at expected location: {href}"
                logging.error(msg)
                raise FileNotFoundError(msg)
            # endif
            # store label and location:
            duplicates.append((fname2, href))
        # endfor

        # all files are present, now remove or rename:
        for fname2, href in duplicates:
            # what to do?
            if remove_duplicates:
                # info ..
                logging.info(f"{indent} remove: {href}")
                # remove:
                os.remove(href)
            else:
                # info ..
                logging.info(f"{indent} rename: {href}")
                # rename:
                os.rename(href, href + ".bak")
            # endif
        # endfor

        # remove files from listing?
        if not duplicates:
            # info ..
            logging.info(f"{indent} no duplicates found ...")
        else:
            # labels of records to be removed:
            labels = [fname2 for fname2, _ in duplicates]
            # info ..
            logging.info(f"{indent}remove {len(labels)} records from listing ...")
            # remove records:
            listing2.df.drop(labels, inplace=True)
            # directory creation mode:
            dmode = self.GetSetting("dmode", totype="int", default=None)
            # save:
            listing2.Save(listfile2, dmode=dmode, indent=f"{indent} ")
        # endif

        # info ...
        logging.info(f"{indent}")
        logging.info(f"{indent}** end cleanup")
        logging.info(f"{indent}")

    # enddef __init__


# endclass CSO_ColHubMirror_Cleanup


########################################################################
###
### end
# src/cso/cso_colhub.py  (diff view: +165 -14)
#
# @@ -24,6 +24,9 @@
# 2025-04, Arjo Segers
#   Changed imports for python packaging.
#
# 2025-09, Arjo Segers
#   Added "CSO_ColHubMirror_Cleanup" class.
#
########################################################################
###
# @@ -515,6 +518,154 @@ class CSO_ColHubMirror_Missing(utopya.UtopyaRc):

# endclass CSO_ColHubMirror_Missing


########################################################################
###
### cleanup mirror extension
###
########################################################################


class CSO_ColHubMirror_Cleanup(utopya.UtopyaRc):
    """
    Cleanup extension to mirror:

    * remove files that are (now) also available in mirror

    In the settings, define the listing files of the mirror and the extension;
    optionally replace time templates in the filename by a specific date
    (default: today)::

        <rcbase>.mirror.file        :  /work/inquire/mirror.csv
        !<rcbase>.mirror.filedate    :  2025-01-24

        <rcbase>.mirror2.file       :  /work/inquire/mirror2.csv
        !<rcbase>.mirror2.filedate   :  2025-01-24

    If files in the extension are also present in the main mirror archive,
    then these records are removed from the listing of the extension.
    The corresponding files are renamed to an extension ``.bak``,
    or removed if the following flag is set::

        ! remove duplicates? otherwise rename to '.bak':
        <rcbase>.mirror2.remove_duplicates   :  False

    Optionally define a creation mode for the (parent) directories::

        ! directory creation mode:
        <rcbase>.dmode    :  0o775

    """

    def __init__(self, rcfile, rcbase="", env={}, indent=""):
        """
        Remove records (and files) from the extension listing that are
        also present in the listing of the main archive.

        Arguments:

        * ``rcfile``, ``rcbase``, ``env`` : passed to ``utopya.UtopyaRc.__init__``
        * ``indent`` : prefix for the log messages

        Raises ``FileNotFoundError`` if a duplicate record refers to a file
        that is not present; in that case no file has been changed yet.
        """

        # modules:
        import os
        import datetime

        # tools:
        from . import cso_file

        # info ...
        logging.info(f"{indent}")
        logging.info(f"{indent}** cleanup mirror extension")
        logging.info(f"{indent}")

        # init base object:
        utopya.UtopyaRc.__init__(self, rcfile=rcfile, rcbase=rcbase, env=env)

        # default date used to evaluate time templates:
        now = datetime.datetime.now()

        # table files in main archive:
        listfile1 = self.GetSetting("mirror.file")
        # evaluate time? the documented key is "mirror.filedate";
        # the formerly used key "mirror1.filedate" is still accepted as fallback:
        filedate1 = self.GetSetting(
            "mirror.filedate",
            totype="datetime",
            default=self.GetSetting("mirror1.filedate", totype="datetime", default=now),
        )
        listfile1 = filedate1.strftime(listfile1)
        # read:
        listing1 = cso_file.CSO_Listing(listfile1)

        # table files in extra archive:
        listfile2 = self.GetSetting("mirror2.file")
        # evaluate time?
        filedate2 = self.GetSetting("mirror2.filedate", totype="datetime", default=now)
        listfile2 = filedate2.strftime(listfile2)
        # read:
        listing2 = cso_file.CSO_Listing(listfile2)

        # remove duplicate files? if not, rename:
        remove_duplicates = self.GetSetting("mirror2.remove_duplicates", totype="bool")

        # info ...
        logging.info(f"{indent}check on duplicate records in extension ...")

        # First collect all duplicates and check that their files are present;
        # nothing is changed on disk yet, so that a missing file does not leave
        # the listing out of sync with files that were already removed/renamed.
        duplicates = []
        # loop over extra files:
        for fname2, row2 in listing2.df.iterrows():
            # only records that are also in main archive:
            if fname2 not in listing1.df.index:
                continue
            # endif
            # info ...
            logging.info(f"{indent}file '{fname2}' already in main archive ...")
            # actual file:
            href = row2["href"]
            # check ..
            if not os.path.isfile(href):
                msg = f"file not found at expected location: {href}"
                logging.error(msg)
                raise FileNotFoundError(msg)
            # endif
            # store label and location:
            duplicates.append((fname2, href))
        # endfor

        # all files are present, now remove or rename:
        for fname2, href in duplicates:
            # what to do?
            if remove_duplicates:
                # info ..
                logging.info(f"{indent} remove: {href}")
                # remove:
                os.remove(href)
            else:
                # info ..
                logging.info(f"{indent} rename: {href}")
                # rename:
                os.rename(href, href + ".bak")
            # endif
        # endfor

        # remove files from listing?
        if not duplicates:
            # info ..
            logging.info(f"{indent} no duplicates found ...")
        else:
            # labels of records to be removed:
            labels = [fname2 for fname2, _ in duplicates]
            # info ..
            logging.info(f"{indent}remove {len(labels)} records from listing ...")
            # remove records:
            listing2.df.drop(labels, inplace=True)
            # directory creation mode:
            dmode = self.GetSetting("dmode", totype="int", default=None)
            # save:
            listing2.Save(listfile2, dmode=dmode, indent=f"{indent} ")
        # endif

        # info ...
        logging.info(f"{indent}")
        logging.info(f"{indent}** end cleanup")
        logging.info(f"{indent}")

    # enddef __init__


# endclass CSO_ColHubMirror_Cleanup


########################################################################
###
### end