Loading src/cso/cso_colhub.py +165 −14 Original line number Diff line number Diff line Loading @@ -24,6 +24,9 @@ # 2025-04, Arjo Segers # Changed imports for python packaging. # # 2025-09, Arjo Segers # Added "CSO_ColHubMirror_Cleanup" class. # ######################################################################## ### Loading Loading @@ -515,6 +518,154 @@ class CSO_ColHubMirror_Missing(utopya.UtopyaRc): # endclass CSO_ColHubMirror_Missing ######################################################################## ### ### cleanup mirror extension ### ######################################################################## class CSO_ColHubMirror_Cleanup(utopya.UtopyaRc): """ Cleanup extension to mirror: * remove files that are (now) also available in mirror In the settings, define the listing files of the mirror and the extension, eventually replace time templates in filename by a specific date (default: today): <rcbase>.mirror.file : /work/inquire/mirror.csv !<rcbase>.mirror.filedate : 2025-01-24 <rcbase>.mirror2.file : /work/inquire/mirror2.csv !<rcbase>.mirror2.filedate : 2025-01-24 If files in the extension are also present in the main mirror archive, then remove these records from the listing. The corresponding files are renamed to an extension ``.bak``, or removed if the following flag is set:: ! remove duplicates? otherwise rename to '.bak': <rcbase>.mirror2.remove_duplicates : False Optionally define a creation mode for the (parent) directories:: ! directory creation mode: <rcbase>.dmode : 0o775 """ def __init__(self, rcfile, rcbase="", env={}, indent=""): """ Convert data. """ # modules: import os import datetime # import glob import collections import fnmatch # tools: from . import cso_file # info ... 
logging.info(f"{indent}") logging.info(f"{indent}** cleanup mirror extension") logging.info(f"{indent}") # init base object: utopya.UtopyaRc.__init__(self, rcfile=rcfile, rcbase=rcbase, env=env) # table files in main archive: listfile1= self.GetSetting("mirror.file") # evaluate time? filedate1 = self.GetSetting( "mirror1.filedate", totype="datetime", default=datetime.datetime.now() ) listfile1 = filedate1.strftime(listfile1) # read: listing1 = cso_file.CSO_Listing(listfile1) # table files in extra archive: listfile2= self.GetSetting("mirror2.file") # evaluate time? filedate2 = self.GetSetting( "mirror2.filedate", totype="datetime", default=datetime.datetime.now() ) listfile2 = filedate2.strftime(listfile2) # read: listing2 = cso_file.CSO_Listing(listfile2) # remove duplicate files? if not, rename: remove_duplicates = self.GetSetting("mirror2.remove_duplicates", totype="bool") # info ... logging.info(f"{indent}check on duplicate records in extension ...") # list of records to be removed: labels = [] # loop over extra files: for fname2, row2 in listing2.df.iterrows(): # check if also in main archive: if fname2 in listing1.df.index: # info ... logging.info(f"{indent}file '{fname2}' already in main archive ...") # actual file: href = row2["href"] # check .. if not os.path.isfile(href): logging.error(f"file not found at expected location: {href}") raise Exception #endif # what to do? if remove_duplicates: # info .. logging.info(f"{indent} remove: {href}") # remove: os.remove( href ) else: # info .. logging.info(f"{indent} rename: {href}") # rename: os.rename( href, href+".bak" ) #endif # store label for removal from listing: labels.append( fname2 ) #endif #endfor # remove files from listing? if len(labels) == 0: # info .. logging.info(f"{indent} no duplicates found ...") else: # info .. 
logging.info(f"{indent}remove {len(labels)} records from listing ...") # remove records: listing2.df.drop( labels, inplace=True ) # directory creation mode: dmode = self.GetSetting("dmode", totype="int", default=None) # save: listing2.Save(listfile2, dmode=dmode, indent=f"{indent} ") #endif # info ... logging.info(f"{indent}") logging.info(f"{indent}** end cleanup") logging.info(f"{indent}") # enddef __init__ # endclass CSO_ColHubMirror_Cleanup ######################################################################## ### ### end Loading src/cso/cso_dataspace.py +29 −24 Original line number Diff line number Diff line Loading @@ -43,6 +43,9 @@ # 2025-04, Arjo Segers # Changed imports for python packaging. # # 2025-10, Arjo Segers # Use mimetype of downloaded product to decide on postprocessing. # ######################################################################## Loading Loading @@ -775,6 +778,7 @@ class CSO_DataSpace_Downloader(object): import os import time import requests import magic import zipfile import shutil Loading Loading @@ -822,9 +826,25 @@ class CSO_DataSpace_Downloader(object): # rename: os.rename(tmpfile, product_file) # try to open product file; # first try if it is a zipfile: try: # file type: mimetype = magic.from_file( product_file, mime=True ) # switch: #~ nc file: if mimetype == "application/x-hdf": # info .. logging.info(f"{indent}product is netcdf file, store ...") # this is the target netcdf file already; # create target dir if necessary: cso_file.CheckDir(output_file, dmode=dmode) # rename to destination: shutil.move(product_file, output_file) #~ zip file: elif mimetype == "application/zip": # info .. 
logging.info(f"{indent}product is zip file, unpack ...") # open as zipfile: arch = zipfile.ZipFile(product_file, mode="r") # loop over members, probably two files in a directory: Loading Loading @@ -859,27 +879,12 @@ class CSO_DataSpace_Downloader(object): logging.info(f"{indent}remove product file ...") # remove package: os.remove(product_file) # except Exception as err: # info .. msg = str(err) # logging.error("from download; message received:") # logging.error(" %s" % msg) # catch known problem ... if "File is not a zip file" in msg: # logging.warning(f"{indent}maybe download was interrupted, try again ...") # info .. logging.info(f"{indent}product is no zipfile, rename to output file ...") # this is probably the target file already; # create target dir if necessary: cso_file.CheckDir(output_file, dmode=dmode) # rename to destination: shutil.move(product_file, output_file) #~ unknown ... else: # quit with error: raise logger.error( f"unsupported mimetype '{mimetype}'" ) raise Exception #endif # endtry # all ok, leave retry loop: break Loading src/cso/cso_s5p.py +2 −2 Original line number Diff line number Diff line Loading @@ -2879,8 +2879,8 @@ class CSO_S5p_Convert(utopya.UtopyaRc): if len(odf) == 0: continue elif len(odf) > 1: logging.error(f"found {len(odf)} records matching selection;" + " use finer selection, or something wrong in inquiry table?" ) logging.error(f"found multiple records matching selection;" + " use finer selection, or something wrong in inquiry table(s)?" ) raise Exception #endif # selected record: Loading src/cso/cso_file.py +1 −1 File changed.Contains only whitespace changes. Show changes Loading
# Excerpt of "src/cso/cso_colhub.py" (history and new class).
#
# 2025-04, Arjo Segers
#   Changed imports for python packaging.
#
# 2025-09, Arjo Segers
#   Added "CSO_ColHubMirror_Cleanup" class.
#

# endclass CSO_ColHubMirror_Missing


########################################################################
###
### cleanup mirror extension
###
########################################################################


class CSO_ColHubMirror_Cleanup(utopya.UtopyaRc):

    """
    Cleanup extension to mirror:

    * remove files that are (now) also available in mirror

    In the settings, define the listing files of the mirror and the extension;
    optionally replace time templates in the filenames by a specific date
    (default: today)::

        <rcbase>.mirror.file          :  /work/inquire/mirror.csv
        !<rcbase>.mirror.filedate     :  2025-01-24

        <rcbase>.mirror2.file         :  /work/inquire/mirror2.csv
        !<rcbase>.mirror2.filedate    :  2025-01-24

    If files in the extension are also present in the main mirror archive,
    then remove these records from the listing of the extension.
    The corresponding files are renamed to an extension ``.bak``,
    or removed if the following flag is set::

        ! remove duplicates? otherwise rename to '.bak':
        <rcbase>.mirror2.remove_duplicates    :  False

    Optionally define a creation mode for the (parent) directories::

        ! directory creation mode:
        <rcbase>.dmode       :  0o775

    All duplicate files are checked for existence *before* any file is
    removed or renamed; if one or more are missing, an error is raised
    and neither the files nor the listing are changed.
    """

    def __init__(self, rcfile, rcbase="", env={}, indent=""):
        """
        Remove duplicates from the mirror extension and update its listing.

        Arguments:

        * ``rcfile`` : settings file, passed to ``utopya.UtopyaRc.__init__``
        * ``rcbase`` : prefix of the setting keys, passed to the base class
        * ``env``    : passed unchanged to the base class (not modified here)
        * ``indent`` : prefix for the log messages

        Raises ``Exception`` if a duplicate record of the extension refers
        to a file that is not present at the location in its ``href`` column.
        """

        # modules:
        import os
        import datetime

        # tools:
        from . import cso_file

        # info ...
        logging.info(f"{indent}")
        logging.info(f"{indent}** cleanup mirror extension")
        logging.info(f"{indent}")

        # init base object:
        utopya.UtopyaRc.__init__(self, rcfile=rcfile, rcbase=rcbase, env=env)

        # default date used to evaluate time templates:
        now = datetime.datetime.now()

        # table files in main archive:
        listfile1 = self.GetSetting("mirror.file")
        # evaluate time? the documented key is "mirror.filedate";
        # the key "mirror1.filedate" that was read originally is kept as
        # fallback, so existing settings remain valid:
        filedate1 = self.GetSetting(
            "mirror.filedate",
            totype="datetime",
            default=self.GetSetting("mirror1.filedate", totype="datetime", default=now),
        )
        listfile1 = filedate1.strftime(listfile1)
        # read:
        listing1 = cso_file.CSO_Listing(listfile1)

        # table files in extra archive:
        listfile2 = self.GetSetting("mirror2.file")
        # evaluate time?
        filedate2 = self.GetSetting("mirror2.filedate", totype="datetime", default=now)
        listfile2 = filedate2.strftime(listfile2)
        # read:
        listing2 = cso_file.CSO_Listing(listfile2)

        # remove duplicate files? if not, rename:
        remove_duplicates = self.GetSetting("mirror2.remove_duplicates", totype="bool")

        # info ...
        logging.info(f"{indent}check on duplicate records in extension ...")

        # first pass: collect the duplicates and check that their files exist,
        # without changing anything yet; otherwise a missing file halfway
        # would leave files removed while the listing still refers to them:
        duplicates = []
        nmissing = 0
        # loop over extra files:
        for fname2, row2 in listing2.df.iterrows():
            # only records that are also in main archive:
            if fname2 not in listing1.df.index:
                continue
            # endif
            # info ...
            logging.info(f"{indent}file '{fname2}' already in main archive ...")
            # actual file:
            href = row2["href"]
            # check ..
            if os.path.isfile(href):
                duplicates.append((fname2, href))
            else:
                logging.error(f"file not found at expected location: {href}")
                nmissing += 1
            # endif
        # endfor

        # any problems? then leave before files are changed:
        if nmissing > 0:
            raise Exception(f"{nmissing} duplicate file(s) not found at expected location")
        # endif

        # second pass: remove or rename the duplicate files:
        for fname2, href in duplicates:
            # what to do?
            if remove_duplicates:
                # info ..
                logging.info(f"{indent}  remove: {href}")
                # remove:
                os.remove(href)
            else:
                # info ..
                logging.info(f"{indent}  rename: {href}")
                # rename:
                os.rename(href, href + ".bak")
            # endif
        # endfor

        # labels of records to be removed from listing:
        labels = [fname2 for fname2, _ in duplicates]

        # remove files from listing?
        if len(labels) == 0:
            # info ..
            logging.info(f"{indent}  no duplicates found ...")
        else:
            # info ..
            logging.info(f"{indent}remove {len(labels)} records from listing ...")
            # remove records:
            listing2.df.drop(labels, inplace=True)
            # directory creation mode:
            dmode = self.GetSetting("dmode", totype="int", default=None)
            # save:
            listing2.Save(listfile2, dmode=dmode, indent=f"{indent}  ")
        # endif

        # info ...
        logging.info(f"{indent}")
        logging.info(f"{indent}** end cleanup")
        logging.info(f"{indent}")

    # enddef __init__


# endclass CSO_ColHubMirror_Cleanup


########################################################################
###
### end
src/cso/cso_dataspace.py +29 −24 Original line number Diff line number Diff line Loading @@ -43,6 +43,9 @@ # 2025-04, Arjo Segers # Changed imports for python packaging. # # 2025-10, Arjo Segers # Use mimetype of downloaded product to decide on postprocessing. # ######################################################################## Loading Loading @@ -775,6 +778,7 @@ class CSO_DataSpace_Downloader(object): import os import time import requests import magic import zipfile import shutil Loading Loading @@ -822,9 +826,25 @@ class CSO_DataSpace_Downloader(object): # rename: os.rename(tmpfile, product_file) # try to open product file; # first try if it is a zipfile: try: # file type: mimetype = magic.from_file( product_file, mime=True ) # switch: #~ nc file: if mimetype == "application/x-hdf": # info .. logging.info(f"{indent}product is netcdf file, store ...") # this is the target netcdf file already; # create target dir if necessary: cso_file.CheckDir(output_file, dmode=dmode) # rename to destination: shutil.move(product_file, output_file) #~ zip file: elif mimetype == "application/zip": # info .. logging.info(f"{indent}product is zip file, unpack ...") # open as zipfile: arch = zipfile.ZipFile(product_file, mode="r") # loop over members, probably two files in a directory: Loading Loading @@ -859,27 +879,12 @@ class CSO_DataSpace_Downloader(object): logging.info(f"{indent}remove product file ...") # remove package: os.remove(product_file) # except Exception as err: # info .. msg = str(err) # logging.error("from download; message received:") # logging.error(" %s" % msg) # catch known problem ... if "File is not a zip file" in msg: # logging.warning(f"{indent}maybe download was interrupted, try again ...") # info .. 
logging.info(f"{indent}product is no zipfile, rename to output file ...") # this is probably the target file already; # create target dir if necessary: cso_file.CheckDir(output_file, dmode=dmode) # rename to destination: shutil.move(product_file, output_file) #~ unknown ... else: # quit with error: raise logger.error( f"unsupported mimetype '{mimetype}'" ) raise Exception #endif # endtry # all ok, leave retry loop: break Loading
src/cso/cso_s5p.py +2 −2 Original line number Diff line number Diff line Loading @@ -2879,8 +2879,8 @@ class CSO_S5p_Convert(utopya.UtopyaRc): if len(odf) == 0: continue elif len(odf) > 1: logging.error(f"found {len(odf)} records matching selection;" + " use finer selection, or something wrong in inquiry table?" ) logging.error(f"found multiple records matching selection;" + " use finer selection, or something wrong in inquiry table(s)?" ) raise Exception #endif # selected record: Loading