From 839e8472f777fa8316915881af97977ec003f476 Mon Sep 17 00:00:00 2001
From: Arjo Segers <arjos@met.no>
Date: Wed, 21 Jan 2026 13:34:58 +0100
Subject: [PATCH 1/3] Added class to cleanup extension of ColHub mirror.

---
 src/cso/cso_colhub.py | 179 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 165 insertions(+), 14 deletions(-)
diff --git a/src/cso/cso_colhub.py b/src/cso/cso_colhub.py
index 88056f9..1e0a4a4 100644
--- a/src/cso/cso_colhub.py
+++ b/src/cso/cso_colhub.py
@@ -24,6 +24,9 @@
 # 2025-04, Arjo Segers
 #   Changed imports for python packaging.
 #
+# 2025-09, Arjo Segers
+#   Added "CSO_ColHubMirror_Cleanup" class.
+#
 
 ########################################################################
 ###
@@ -38,16 +41,16 @@
 ``cso_colhub`` module
 *********************
 
-The :py:mod:`cso.cso_colhub` module provides classes for accessing  data from the 
+The :py:mod:`cso.cso_colhub` module provides classes for accessing  data from the
 Norwegian `ColHub <https://colhub.met.no>`_ archive.
-This is a (partial) mirror of the `Copernicus Open Access Hub <https://colhub.copernicus.eu/>`_ 
+This is a (partial) mirror of the `Copernicus Open Access Hub <https://colhub.copernicus.eu/>`_
 with all orbits covering Norway and surrounding areas.
 
-The data can be accessed using a web-interface, 
+The data can be accessed using a web-interface,
 but for Met Norway users also directly from the storage.
-This module provides the :py:class:`.CSO_ColHubMirror_Inquire` to inquire the storage 
+This module provides the :py:class:`.CSO_ColHubMirror_Inquire` to inquire the storage
 to see what is already available.
-In addition, the :py:class:`.CSO_ColHubMirror_Missing` is available to create a table 
+In addition, the :py:class:`.CSO_ColHubMirror_Missing` is available to create a table
 that lists files that are in the DataSpace but not yet in the mirror;
 these could be used to make additional downloads.
 
@@ -312,7 +315,7 @@ class CSO_ColHubMirror_Missing(utopya.UtopyaRc):
     """
     Create *listing* file for files that are in one inquiry table but not in another one.
     This could be used to complete a mirror archive.
-    
+
     The format is similar to the output of *inquiry* classes,
     with per line a filename, the time range of pixels in the file, and other information extracted from the filenames::
 
@@ -322,19 +325,19 @@ class CSO_ColHubMirror_Missing(utopya.UtopyaRc):
 
     In the settings, define the listing file with all available data, for example the result of an inquiry step;
     eventually add a timestamp to replace templates in the filename::
-    
+
         <rcbase>.all.file           :  /work/inquire/Copernicus_S5p_NO2_dataspace__%Y-%m-%d.csv
         !<rcbase>.all.filedate       :  2025-01-24
-        
-    Similar specify the name (or ";" seperated list of names of the file that is listing the current mirror(s), 
+
+    Similarly specify the name (or ";" separated list of names) of the file(s) listing the current mirror(s),
     probably the output of the :py:class:`CSO_ColHubMirror_Inquire` class::
 
         <rcbase>.curr.file           :  /work/inquire/Copernicus_S5p_NO2_colhub-mirror__%Y-%m-%d.csv ; \\
                                         /work/inquire/Copernicus_S5p_NO2_colhub-mirror2__%Y-%m-%d.csv
         !<rcbase>.curr.filedate       :  2025-01-24
-        
+
     Specify a selection filter; this defines which of the orbit files are actually needed::
-   
+
         ! Provide ';' seperated list of to decide if a particular orbit file should be processed.
         ! If more than one file is available for a particular orbit (from "OFFL" and "RPRO" processing),
         ! the file with the first match will be used.
@@ -343,19 +346,19 @@ class CSO_ColHubMirror_Missing(utopya.UtopyaRc):
         <rcbase>.selection                     :  (%{collection} == '03') and (%{processing} == 'RPRO') ; \\
                                                   (%{collection} == '03') and (%{processing} == 'OFFL')
 
-    Specifiy the output file::    
+    Specify the output file::
 
         ! csv file that will hold records per file with:
         ! - timerange of pixels in file
         ! - orbit number
         ! time templates are replaced with todays date
         <rcbase>.file        :  /work/inquire/Copernicus_S5p_NO2_colhub-mirror-missing__%Y-%m-%d.csv
-        
+
     Optionally define a creation mode for the (parent) directories::
 
         ! directory creation mode:
         <rcbase>.dmode                         :  0o775
-    
+
     An existing listing file is not replaced,
     unless the following flag is set::
 
@@ -515,6 +518,154 @@ class CSO_ColHubMirror_Missing(utopya.UtopyaRc):
 
 # endclass CSO_ColHubMirror_Missing
 
+
+########################################################################
+###
+### cleanup mirror extension
+###
+########################################################################
+
+
+class CSO_ColHubMirror_Cleanup(utopya.UtopyaRc):
+
+    """
+    Cleanup extension to mirror:
+
+    * remove files that are (now) also available in mirror
+
+    In the settings, define the listing files of the mirror and the extension,
+    eventually replace time templates in filename by a specific date (default: today)::
+
+        <rcbase>.mirror.file           :  /work/inquire/mirror.csv
+        !<rcbase>.mirror.filedate       :  2025-01-24
+
+        <rcbase>.mirror2.file           :  /work/inquire/mirror2.csv
+        !<rcbase>.mirror2.filedate       :  2025-01-24
+
+    If files in the extension are also present in the main mirror archive,
+    then remove these records from the listing.
+    The corresponding files are renamed to an extension ``.bak``,
+    or removed if the following flag is set::
+
+        ! remove duplicates? otherwise rename to '.bak':
+        <rcbase>.mirror2.remove_duplicates   :  False
+
+    Optionally define a creation mode for the (parent) directories::
+
+        ! directory creation mode:
+        <rcbase>.dmode                         :  0o775
+
+
+    """
+
+    def __init__(self, rcfile, rcbase="", env={}, indent=""):
+        """
+        Remove files from the mirror extension that are also in the main mirror.
+
+        Duplicates are removed or renamed to ``.bak``, and the extension listing
+        is saved without their records. Raises ``Exception`` if a listed duplicate
+        does not exist on disk.
+        """
+
+        # modules:
+        import os
+        import datetime
+
+        # tools:
+        from . import cso_file
+
+        # info ...
+        logging.info(f"{indent}")
+        logging.info(f"{indent}** cleanup mirror extension")
+        logging.info(f"{indent}")
+
+        # init base object:
+        utopya.UtopyaRc.__init__(self, rcfile=rcfile, rcbase=rcbase, env=env)
+
+        # table files in main archive:
+        listfile1 = self.GetSetting("mirror.file")
+        # evaluate time?
+        filedate1 = self.GetSetting(
+            "mirror.filedate", totype="datetime", default=datetime.datetime.now()
+        )
+        listfile1 = filedate1.strftime(listfile1)
+        # read:
+        listing1 = cso_file.CSO_Listing(listfile1)
+
+        # table files in extra archive:
+        listfile2 = self.GetSetting("mirror2.file")
+        # evaluate time?
+        filedate2 = self.GetSetting(
+            "mirror2.filedate", totype="datetime", default=datetime.datetime.now()
+        )
+        listfile2 = filedate2.strftime(listfile2)
+        # read:
+        listing2 = cso_file.CSO_Listing(listfile2)
+
+        # remove duplicate files? if not, rename:
+        remove_duplicates = self.GetSetting("mirror2.remove_duplicates", totype="bool")
+
+        # info ...
+        logging.info(f"{indent}check on duplicate records in extension ...")
+
+        # list of records to be removed:
+        labels = []
+        # loop over extra files:
+        for fname2, row2 in listing2.df.iterrows():
+            # check if also in main archive:
+            if fname2 in listing1.df.index:
+                # info ...
+                logging.info(f"{indent}file '{fname2}' already in main archive ...")
+                # actual file:
+                href = row2["href"]
+                # check ..
+                if not os.path.isfile(href):
+                    logging.error(f"file not found at expected location: {href}")
+                    raise Exception
+                #endif
+                # what to do?
+                if remove_duplicates:
+                    # info ..
+                    logging.info(f"{indent}  remove: {href}")
+                    # remove:
+                    os.remove( href )
+                else:
+                    # info ..
+                    logging.info(f"{indent}  rename: {href}")
+                    # rename:
+                    os.rename( href, href+".bak" )
+                #endif
+                # store label for removal from listing:
+                labels.append( fname2 )
+            #endif
+        #endfor
+
+        # remove files from listing?
+        if len(labels) == 0:
+            # info ..
+            logging.info(f"{indent}  no duplicates found ...")
+        else:
+            # info ..
+            logging.info(f"{indent}remove {len(labels)} records from listing ...")
+            # remove records:
+            listing2.df.drop( labels, inplace=True )
+            # directory creation mode:
+            dmode = self.GetSetting("dmode", totype="int", default=None)
+            # save:
+            listing2.Save(listfile2, dmode=dmode, indent=f"{indent}  ")
+        #endif
+
+        # info ...
+        logging.info(f"{indent}")
+        logging.info(f"{indent}** end cleanup")
+        logging.info(f"{indent}")
+
+    # enddef __init__
+
+
+# endclass CSO_ColHubMirror_Cleanup
+
+
 ########################################################################
 ###
 ### end
-- 
GitLab


From 83635e2db009d873f1ef26a807620b3f0cf5311b Mon Sep 17 00:00:00 2001
From: Arjo Segers <arjos@met.no>
Date: Wed, 21 Jan 2026 13:35:34 +0100
Subject: [PATCH 2/3] Use mimetype of downloaded product to decide on
 postprocessing.

---
 src/cso/cso_dataspace.py | 53 ++++++++++++++++++++++------------------
 1 file changed, 29 insertions(+), 24 deletions(-)

diff --git a/src/cso/cso_dataspace.py b/src/cso/cso_dataspace.py
index 405f017..4a595ea 100644
--- a/src/cso/cso_dataspace.py
+++ b/src/cso/cso_dataspace.py
@@ -43,6 +43,9 @@
 # 2025-04, Arjo Segers
 #   Changed imports for python packaging.
 #
+# 2025-10, Arjo Segers
+#   Use mimetype of downloaded product to decide on postprocessing.
+#
 
 
 ########################################################################
@@ -775,6 +778,7 @@ class CSO_DataSpace_Downloader(object):
         import os
         import time
         import requests
+        import magic
         import zipfile
         import shutil
 
@@ -821,10 +825,26 @@ class CSO_DataSpace_Downloader(object):
                 # endwith
                 # rename:
                 os.rename(tmpfile, product_file)
+
+                # file type:
+                mimetype = magic.from_file( product_file, mime=True )
+                # switch:
+                #~ nc file (netCDF4 is HDF5 based; some libmagic versions report "x-hdf5"):
+                if mimetype in ("application/x-hdf", "application/x-hdf5"):
+
+                    # info ..
+                    logging.info(f"{indent}product is netcdf file, store ...")
+                    # this is the target netcdf file already;
+                    # create target dir if necessary:
+                    cso_file.CheckDir(output_file, dmode=dmode)
+                    # rename to destination:
+                    shutil.move(product_file, output_file)
 
-                # try to open product file;
-                # first try if it is a zipfile:
-                try:
+                #~ zip file:
+                elif mimetype == "application/zip":
+
+                    # info ..
+                    logging.info(f"{indent}product is zip file, unpack ...")
                     # open as zipfile:
                     arch = zipfile.ZipFile(product_file, mode="r")
                     # loop over members, probably two files in a directory:
@@ -859,27 +879,12 @@ class CSO_DataSpace_Downloader(object):
                     logging.info(f"{indent}remove product file ...")
                     # remove package:
                     os.remove(product_file)
-                #
-                except Exception as err:
-                    # info ..
-                    msg = str(err)
-                    # logging.error("from download; message received:")
-                    # logging.error("  %s" % msg)
-                    # catch known problem ...
-                    if "File is not a zip file" in msg:
-                        # logging.warning(f"{indent}maybe download was interrupted, try again  ...")
-                        # info ..
-                        logging.info(f"{indent}product is no zipfile, rename to output file ...")
-                        # this is probably the target file already;
-                        # create target dir if necessary:
-                        cso_file.CheckDir(output_file, dmode=dmode)
-                        # rename to destination:
-                        shutil.move(product_file, output_file)
-                    else:
-                        # quit with error:
-                        raise
-                    # endif
-                # endtry
+
+                #~ unknown (neither netcdf nor zip), report and quit with error:
+                else:
+                    logging.error( f"unsupported mimetype '{mimetype}'" )
+                    raise Exception
+                #endif
 
                 # all ok, leave retry loop:
                 break
-- 
GitLab


From 51343099c156e57154ac4c129db016af569673b2 Mon Sep 17 00:00:00 2001
From: Arjo Segers <arjos@met.no>
Date: Wed, 21 Jan 2026 13:35:52 +0100
Subject: [PATCH 3/3] Updated messages.

---
 src/cso/cso_file.py | 2 +-
 src/cso/cso_s5p.py  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/cso/cso_file.py b/src/cso/cso_file.py
index aa59f5d..5bb4eda 100644
--- a/src/cso/cso_file.py
+++ b/src/cso/cso_file.py
@@ -1338,7 +1338,7 @@ class CSO_Listing(object):
             # endif
 
             # info ...
-            logging.info(f"{indent} read listing {filename} ...")
+            logging.info(f"{indent}read listing {filename} ...")
             # read:
             self.df = pandas.read_csv(
                 filename,
diff --git a/src/cso/cso_s5p.py b/src/cso/cso_s5p.py
index c20275d..e3a6d77 100644
--- a/src/cso/cso_s5p.py
+++ b/src/cso/cso_s5p.py
@@ -2879,8 +2879,8 @@ class CSO_S5p_Convert(utopya.UtopyaRc):
             if len(odf) == 0:
                 continue
             elif len(odf) > 1:
-                logging.error(f"found {len(odf)} records matching selection;" +
-                    " use finer selection, or something wrong in inquiry table?" )
+                logging.error(f"found multiple records matching selection;" +
+                    " use finer selection, or something wrong in inquiry table(s)?" )
                 raise Exception
             #endif
             # selected record:
-- 
GitLab