TNO Intern

Commit c98d43dd authored by Arjo Segers's avatar Arjo Segers
Browse files

Merge branch 'merge-sesam' into 'master'

Merge sesam

See merge request !19
parents 241b9313 51343099
Loading
Loading
Loading
Loading
Loading
+165 −14
Original line number Diff line number Diff line
@@ -24,6 +24,9 @@
# 2025-04, Arjo Segers
#   Changed imports for python packaging.
#
# 2025-09, Arjo Segers
#   Added "CSO_ColHubMirror_Cleanup" class.
#

########################################################################
###
@@ -515,6 +518,154 @@ class CSO_ColHubMirror_Missing(utopya.UtopyaRc):

# endclass CSO_ColHubMirror_Missing


########################################################################
###
### cleanup mirror extension
###
########################################################################


class CSO_ColHubMirror_Cleanup(utopya.UtopyaRc):

    """
    Cleanup extension to mirror:

    * remove files that are (now) also available in mirror

    In the settings, define the listing files of the mirror and the extension;
    optionally replace time templates in the filenames by a specific date
    (default: today)::

        <rcbase>.mirror.file            :  /work/inquire/mirror.csv
        !<rcbase>.mirror.filedate       :  2025-01-24

        <rcbase>.mirror2.file           :  /work/inquire/mirror2.csv
        !<rcbase>.mirror2.filedate      :  2025-01-24

    If files in the extension are also present in the main mirror archive,
    then remove these records from the listing.
    The corresponding files are renamed to an extension ``.bak``,
    or removed if the following flag is set::

        ! remove duplicates? otherwise rename to '.bak':
        <rcbase>.mirror2.remove_duplicates   :  False

    Optionally define a creation mode for the (parent) directories::

        ! directory creation mode:
        <rcbase>.dmode                         :  0o775

    """

    def __init__(self, rcfile, rcbase="", env=None, indent=""):
        """
        Remove records (and files) from the extension listing that are
        also present in the main mirror listing.

        Arguments:

        * ``rcfile``, ``rcbase``, ``env`` : passed to the ``UtopyaRc`` initialization;
          if ``env`` is ``None`` an empty dictionary is passed
        * ``indent`` : prefix for log messages

        Raises ``FileNotFoundError`` if the ``href`` of a duplicate record
        is not an existing file.
        """

        # modules:
        import os
        import datetime

        # tools:
        from . import cso_file

        # info ...
        logging.info(f"{indent}")
        logging.info(f"{indent}** cleanup mirror extension")
        logging.info(f"{indent}")

        # do not share a mutable default between calls:
        if env is None:
            env = {}
        # endif

        # init base object:
        utopya.UtopyaRc.__init__(self, rcfile=rcfile, rcbase=rcbase, env=env)

        # default date used to expand time templates in the listing filenames:
        today = datetime.datetime.now()

        # table files in main archive:
        listfile1 = self.GetSetting("mirror.file")
        # evaluate time? the documented key is "mirror.filedate";
        # the key "mirror1.filedate" that was read before is still
        # accepted as fallback for existing settings:
        filedate1 = self.GetSetting(
            "mirror.filedate",
            totype="datetime",
            default=self.GetSetting("mirror1.filedate", totype="datetime", default=today),
        )
        listfile1 = filedate1.strftime(listfile1)
        # read:
        listing1 = cso_file.CSO_Listing(listfile1)

        # table files in extra archive:
        listfile2 = self.GetSetting("mirror2.file")
        # evaluate time?
        filedate2 = self.GetSetting("mirror2.filedate", totype="datetime", default=today)
        listfile2 = filedate2.strftime(listfile2)
        # read:
        listing2 = cso_file.CSO_Listing(listfile2)

        # remove duplicate files? if not, rename:
        remove_duplicates = self.GetSetting("mirror2.remove_duplicates", totype="bool")

        # info ...
        logging.info(f"{indent}check on duplicate records in extension ...")

        # list of records to be removed:
        labels = []
        # loop over extra files:
        for fname2, row2 in listing2.df.iterrows():
            # only records that are also in main archive:
            if fname2 not in listing1.df.index:
                continue
            # endif
            # info ...
            logging.info(f"{indent}file '{fname2}' already in main archive ...")
            # actual file:
            href = row2["href"]
            # check ..
            if not os.path.isfile(href):
                msg = f"file not found at expected location: {href}"
                logging.error(msg)
                raise FileNotFoundError(msg)
            # endif
            # what to do?
            if remove_duplicates:
                # info ..
                logging.info(f"{indent}  remove: {href}")
                # remove:
                os.remove(href)
            else:
                # info ..
                logging.info(f"{indent}  rename: {href}")
                # rename:
                os.rename(href, href + ".bak")
            # endif
            # store label for removal from listing:
            labels.append(fname2)
        # endfor

        # remove files from listing?
        if not labels:
            # info ..
            logging.info(f"{indent}  no duplicates found ...")
        else:
            # info ..
            logging.info(f"{indent}remove {len(labels)} records from listing ...")
            # remove records:
            listing2.df.drop(labels, inplace=True)
            # directory creation mode:
            dmode = self.GetSetting("dmode", totype="int", default=None)
            # save:
            listing2.Save(listfile2, dmode=dmode, indent=f"{indent}  ")
        # endif

        # info ...
        logging.info(f"{indent}")
        logging.info(f"{indent}** end cleanup")
        logging.info(f"{indent}")

    # enddef __init__


# endclass CSO_ColHubMirror_Cleanup


########################################################################
###
### end
+29 −24
Original line number Diff line number Diff line
@@ -43,6 +43,9 @@
# 2025-04, Arjo Segers
#   Changed imports for python packaging.
#
# 2025-10, Arjo Segers
#   Use mimetype of downloaded product to decide on postprocessing.
#


########################################################################
@@ -775,6 +778,7 @@ class CSO_DataSpace_Downloader(object):
        import os
        import time
        import requests
        import magic
        import zipfile
        import shutil

@@ -822,9 +826,25 @@ class CSO_DataSpace_Downloader(object):
                # rename:
                os.rename(tmpfile, product_file)
                
                # try to open product file;
                # first try if it is a zipfile:
                try:
                # file type:
                mimetype = magic.from_file( product_file, mime=True )
                # switch:
                #~ nc file:
                if mimetype == "application/x-hdf":

                    # info ..
                    logging.info(f"{indent}product is netcdf file, store ...")
                    # this is the target netcdf file already;
                    # create target dir if necessary:
                    cso_file.CheckDir(output_file, dmode=dmode)
                    # rename to destination:
                    shutil.move(product_file, output_file)

                #~ zip file:
                elif mimetype == "application/zip":

                    # info ..
                    logging.info(f"{indent}product is zip file, unpack ...")
                    # open as zipfile:
                    arch = zipfile.ZipFile(product_file, mode="r")
                    # loop over members, probably two files in a directory:
@@ -859,27 +879,12 @@ class CSO_DataSpace_Downloader(object):
                    logging.info(f"{indent}remove product file ...")
                    # remove package:
                    os.remove(product_file)
                #
                except Exception as err:
                    # info ..
                    msg = str(err)
                    # logging.error("from download; message received:")
                    # logging.error("  %s" % msg)
                    # catch known problem ...
                    if "File is not a zip file" in msg:
                        # logging.warning(f"{indent}maybe download was interrupted, try again  ...")
                        # info ..
                        logging.info(f"{indent}product is no zipfile, rename to output file ...")
                        # this is probably the target file already;
                        # create target dir if necessary:
                        cso_file.CheckDir(output_file, dmode=dmode)
                        # rename to destination:
                        shutil.move(product_file, output_file)
                
                #~ unknown ...
                else:
                        # quit with error:
                        raise
                    logger.error( f"unsupported mimetype '{mimetype}'" )
                    raise Exception
                #endif
                # endtry

                # all ok, leave retry loop:
                break
+2 −2
Original line number Diff line number Diff line
@@ -2879,8 +2879,8 @@ class CSO_S5p_Convert(utopya.UtopyaRc):
            if len(odf) == 0:
                continue
            elif len(odf) > 1:
                logging.error(f"found {len(odf)} records matching selection;" +
                    " use finer selection, or something wrong in inquiry table?" )
                logging.error(f"found multiple records matching selection;" +
                    " use finer selection, or something wrong in inquiry table(s)?" )
                raise Exception
            #endif
            # selected record:
+1 −1

File changed.

Contains only whitespace changes.