From 839e8472f777fa8316915881af97977ec003f476 Mon Sep 17 00:00:00 2001 From: Arjo Segers Date: Wed, 21 Jan 2026 13:34:58 +0100 Subject: [PATCH 1/3] Added class to cleanup extension of ColHub mirror. --- src/cso/cso_colhub.py | 179 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 165 insertions(+), 14 deletions(-) diff --git a/src/cso/cso_colhub.py b/src/cso/cso_colhub.py index 88056f9..1e0a4a4 100644 --- a/src/cso/cso_colhub.py +++ b/src/cso/cso_colhub.py @@ -24,6 +24,9 @@ # 2025-04, Arjo Segers # Changed imports for python packaging. # +# 2025-09, Arjo Segers +# Added "CSO_ColHubMirror_Cleanup" class. +# ######################################################################## ### @@ -38,16 +41,16 @@ ``cso_colhub`` module ********************* -The :py:mod:`cso.cso_colhub` module provides classes for accessing data from the +The :py:mod:`cso.cso_colhub` module provides classes for accessing data from the Norwegian `ColHub `_ archive. -This is a (partial) mirror of the `Copernicus Open Access Hub `_ +This is a (partial) mirror of the `Copernicus Open Access Hub `_ with all orbits covering Norway and surrounding areas. -The data can be accessed using a web-interface, +The data can be accessed using a web-interface, but for Met Norway users also directly from the storage. -This module provides the :py:class:`.CSO_ColHubMirror_Inquire` to inquire the storage +This module provides the :py:class:`.CSO_ColHubMirror_Inquire` to inquire the storage to see what is already available. -In addition, the :py:class:`.CSO_ColHubMirror_Missing` is available to create a table +In addition, the :py:class:`.CSO_ColHubMirror_Missing` is available to create a table that lists files that are in the DataSpace but not yet in the mirror; these could be used to make additional downloads. @@ -312,7 +315,7 @@ class CSO_ColHubMirror_Missing(utopya.UtopyaRc): """ Create *listing* file for files that are in one inquiry table but not in another one. 
This could be used to complete a mirror archive. - + The format is similar to the output of *inquiry* classes, with per line a filename, the time range of pixels in the file, and other information extracted from the filenames:: @@ -322,19 +325,19 @@ class CSO_ColHubMirror_Missing(utopya.UtopyaRc): In the settings, define the listing file with all available data, for example the result of an inquiry step; eventually add a timestamp to replace templates in the filename:: - + .all.file : /work/inquire/Copernicus_S5p_NO2_dataspace__%Y-%m-%d.csv !.all.filedate : 2025-01-24 - - Similar specify the name (or ";" seperated list of names of the file that is listing the current mirror(s), + + Similar specify the name (or ";" seperated list of names of the file that is listing the current mirror(s), probably the output of the :py:class:`CSO_ColHubMirror_Inquire` class:: .curr.file : /work/inquire/Copernicus_S5p_NO2_colhub-mirror__%Y-%m-%d.csv ; \\ /work/inquire/Copernicus_S5p_NO2_colhub-mirror2__%Y-%m-%d.csv !.curr.filedate : 2025-01-24 - + Specify a selection filter; this defines which of the orbit files are actually needed:: - + ! Provide ';' seperated list of to decide if a particular orbit file should be processed. ! If more than one file is available for a particular orbit (from "OFFL" and "RPRO" processing), ! the file with the first match will be used. @@ -343,19 +346,19 @@ class CSO_ColHubMirror_Missing(utopya.UtopyaRc): .selection : (%{collection} == '03') and (%{processing} == 'RPRO') ; \\ (%{collection} == '03') and (%{processing} == 'OFFL') - Specifiy the output file:: + Specifiy the output file:: ! csv file that will hold records per file with: ! - timerange of pixels in file ! - orbit number ! time templates are replaced with todays date .file : /work/inquire/Copernicus_S5p_NO2_colhub-mirror-missing__%Y-%m-%d.csv - + Optionally define a creation mode for the (parent) directories:: ! 
directory creation mode: .dmode : 0o775 - + An existing listing file is not replaced, unless the following flag is set:: @@ -515,6 +518,154 @@ class CSO_ColHubMirror_Missing(utopya.UtopyaRc): # endclass CSO_ColHubMirror_Missing + +######################################################################## +### +### cleanup mirror extension +### +######################################################################## + + +class CSO_ColHubMirror_Cleanup(utopya.UtopyaRc): + + """ + Cleanup extension to mirror: + + * remove files that are (now) also available in mirror + + In the settings, define the listing files of the mirror and the extension, + optionally replace time templates in filename by a specific date (default: today):: + + .mirror.file : /work/inquire/mirror.csv + !.mirror.filedate : 2025-01-24 + + .mirror2.file : /work/inquire/mirror2.csv + !.mirror2.filedate : 2025-01-24 + + If files in the extension are also present in the main mirror archive, + then remove these records from the listing. + The corresponding files are renamed to an extension ``.bak``, + or removed if the following flag is set:: + + ! remove duplicates? otherwise rename to '.bak': + .mirror2.remove_duplicates : False + + Optionally define a creation mode for the (parent) directories:: + + ! directory creation mode: + .dmode : 0o775 + + + """ + + def __init__(self, rcfile, rcbase="", env={}, indent=""): + """ + Remove (or rename) extension files that are also in the main mirror, and update the extension listing. + """ + + # modules: + import os + import datetime + + # import glob + import collections + import fnmatch + + # tools: + from . import cso_file + + # info ... + logging.info(f"{indent}") + logging.info(f"{indent}** cleanup mirror extension") + logging.info(f"{indent}") + + # init base object: + utopya.UtopyaRc.__init__(self, rcfile=rcfile, rcbase=rcbase, env=env) + + # table files in main archive: + listfile1= self.GetSetting("mirror.file") + # evaluate time? 
+ filedate1 = self.GetSetting( + "mirror.filedate", totype="datetime", default=datetime.datetime.now() + ) + listfile1 = filedate1.strftime(listfile1) + # read: + listing1 = cso_file.CSO_Listing(listfile1) + + # table files in extra archive: + listfile2= self.GetSetting("mirror2.file") + # evaluate time? + filedate2 = self.GetSetting( + "mirror2.filedate", totype="datetime", default=datetime.datetime.now() + ) + listfile2 = filedate2.strftime(listfile2) + # read: + listing2 = cso_file.CSO_Listing(listfile2) + + # remove duplicate files? if not, rename: + remove_duplicates = self.GetSetting("mirror2.remove_duplicates", totype="bool") + + # info ... + logging.info(f"{indent}check on duplicate records in extension ...") + + # list of records to be removed: + labels = [] + # loop over extra files: + for fname2, row2 in listing2.df.iterrows(): + # check if also in main archive: + if fname2 in listing1.df.index: + # info ... + logging.info(f"{indent}file '{fname2}' already in main archive ...") + # actual file: + href = row2["href"] + # check .. + if not os.path.isfile(href): + logging.error(f"file not found at expected location: {href}") + raise Exception + #endif + # what to do? + if remove_duplicates: + # info .. + logging.info(f"{indent} remove: {href}") + # remove: + os.remove( href ) + else: + # info .. + logging.info(f"{indent} rename: {href}") + # rename: + os.rename( href, href+".bak" ) + #endif + # store label for removal from listing: + labels.append( fname2 ) + #endif + #endfor + + # remove files from listing? + if len(labels) == 0: + # info .. + logging.info(f"{indent} no duplicates found ...") + else: + # info .. + logging.info(f"{indent}remove {len(labels)} records from listing ...") + # remove records: + listing2.df.drop( labels, inplace=True ) + # directory creation mode: + dmode = self.GetSetting("dmode", totype="int", default=None) + # save: + listing2.Save(listfile2, dmode=dmode, indent=f"{indent} ") + #endif + + # info ... 
+ logging.info(f"{indent}") + logging.info(f"{indent}** end cleanup") + logging.info(f"{indent}") + + # enddef __init__ + + +# endclass CSO_ColHubMirror_Cleanup + + ######################################################################## ### ### end -- GitLab From 83635e2db009d873f1ef26a807620b3f0cf5311b Mon Sep 17 00:00:00 2001 From: Arjo Segers Date: Wed, 21 Jan 2026 13:35:34 +0100 Subject: [PATCH 2/3] Use mimetype of downloaded product to decide on postprocessing. --- src/cso/cso_dataspace.py | 53 ++++++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/src/cso/cso_dataspace.py b/src/cso/cso_dataspace.py index 405f017..4a595ea 100644 --- a/src/cso/cso_dataspace.py +++ b/src/cso/cso_dataspace.py @@ -43,6 +43,9 @@ # 2025-04, Arjo Segers # Changed imports for python packaging. # +# 2025-10, Arjo Segers +# Use mimetype of downloaded product to decide on postprocessing. +# ######################################################################## @@ -775,6 +778,7 @@ class CSO_DataSpace_Downloader(object): import os import time import requests + import magic import zipfile import shutil @@ -821,10 +825,26 @@ class CSO_DataSpace_Downloader(object): # endwith # rename: os.rename(tmpfile, product_file) + + # file type: + mimetype = magic.from_file( product_file, mime=True ) + # switch: + #~ nc file: + if mimetype in ("application/x-hdf", "application/x-hdf5"): + + # info .. + logging.info(f"{indent}product is netcdf file, store ...") + # this is the target netcdf file already; + # create target dir if necessary: + cso_file.CheckDir(output_file, dmode=dmode) + # rename to destination: + shutil.move(product_file, output_file) - # try to open product file; - # first try if it is a zipfile: - try: + #~ zip file: + elif mimetype == "application/zip": + + # info .. 
+ logging.info(f"{indent}product is zip file, unpack ...") # open as zipfile: arch = zipfile.ZipFile(product_file, mode="r") # loop over members, probably two files in a directory: @@ -859,27 +879,12 @@ class CSO_DataSpace_Downloader(object): logging.info(f"{indent}remove product file ...") # remove package: os.remove(product_file) - # - except Exception as err: - # info .. - msg = str(err) - # logging.error("from download; message received:") - # logging.error(" %s" % msg) - # catch known problem ... - if "File is not a zip file" in msg: - # logging.warning(f"{indent}maybe download was interrupted, try again ...") - # info .. - logging.info(f"{indent}product is no zipfile, rename to output file ...") - # this is probably the target file already; - # create target dir if necessary: - cso_file.CheckDir(output_file, dmode=dmode) - # rename to destination: - shutil.move(product_file, output_file) - else: - # quit with error: - raise - # endif - # endtry + + #~ unknown ... + else: + logging.error( f"unsupported mimetype '{mimetype}'" ) + raise Exception + #endif # all ok, leave retry loop: break -- GitLab From 51343099c156e57154ac4c129db016af569673b2 Mon Sep 17 00:00:00 2001 From: Arjo Segers Date: Wed, 21 Jan 2026 13:35:52 +0100 Subject: [PATCH 3/3] Updated messsages. --- src/cso/cso_file.py | 2 +- src/cso/cso_s5p.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cso/cso_file.py b/src/cso/cso_file.py index aa59f5d..5bb4eda 100644 --- a/src/cso/cso_file.py +++ b/src/cso/cso_file.py @@ -1338,7 +1338,7 @@ class CSO_Listing(object): # endif # info ... 
- logging.info(f"{indent} read listing {filename} ...") + logging.info(f"{indent}read listing {filename} ...") # read: self.df = pandas.read_csv( filename, diff --git a/src/cso/cso_s5p.py b/src/cso/cso_s5p.py index c20275d..e3a6d77 100644 --- a/src/cso/cso_s5p.py +++ b/src/cso/cso_s5p.py @@ -2879,8 +2879,8 @@ class CSO_S5p_Convert(utopya.UtopyaRc): if len(odf) == 0: continue elif len(odf) > 1: - logging.error(f"found {len(odf)} records matching selection;" + - " use finer selection, or something wrong in inquiry table?" ) + logging.error(f"found multiple records matching selection;" + + " use finer selection, or something wrong in inquiry table(s)?" ) raise Exception #endif # selected record: -- GitLab