TNO Intern

Commit 839e8472 authored by Arjo Segers's avatar Arjo Segers
Browse files

Added class to cleanup extension of ColHub mirror.

parent 241b9313
Loading
Loading
Loading
Loading
+165 −14
Original line number Diff line number Diff line
@@ -24,6 +24,9 @@
# 2025-04, Arjo Segers
#   Changed imports for python packaging.
#
# 2025-09, Arjo Segers
#   Added "CSO_ColHubMirror_Cleanup" class.
#

########################################################################
###
@@ -515,6 +518,154 @@ class CSO_ColHubMirror_Missing(utopya.UtopyaRc):

# endclass CSO_ColHubMirror_Missing


########################################################################
###
### cleanup mirror extension
###
########################################################################


class CSO_ColHubMirror_Cleanup(utopya.UtopyaRc):

    """
    Cleanup extension to mirror:

    * remove files that are (now) also available in mirror

    In the settings, define the listing files of the main mirror and of the extension;
    optionally replace time templates in the filenames by a specific date (default: today)::

        <rcbase>.mirror.file            :  /work/inquire/mirror.csv
        !<rcbase>.mirror.filedate       :  2025-01-24

        <rcbase>.mirror2.file           :  /work/inquire/mirror2.csv
        !<rcbase>.mirror2.filedate      :  2025-01-24

    For backwards compatibility the key ``<rcbase>.mirror1.filedate`` is still
    accepted if ``<rcbase>.mirror.filedate`` is not defined.

    If files in the extension are also present in the main mirror archive,
    then these records are removed from the listing of the extension.
    The corresponding files are renamed to an extension ``.bak``,
    or removed if the following (optional) flag is enabled::

        ! remove duplicates? otherwise rename to '.bak':
        <rcbase>.mirror2.remove_duplicates   :  False

    All duplicate files are checked for presence before any of them is changed;
    if one is missing, an exception is raised and nothing is removed or renamed.

    Optionally define a creation mode for the (parent) directories::

        ! directory creation mode:
        <rcbase>.dmode                         :  0o775


    """

    # NOTE: the mutable default for "env" is kept for interface compatibility;
    # it is only passed on to the base class here.
    def __init__(self, rcfile, rcbase="", env={}, indent=""):
        """
        Read the listings of the main mirror and of the extension,
        rename or remove the files of the extension that are also in the main
        mirror, and save the extension listing without these records.

        Arguments:

        * ``rcfile`` : settings file, passed to the base class;
        * ``rcbase`` : prefix of the setting keys, passed to the base class;
        * ``env``    : passed to the base class;
        * ``indent`` : prefix for log messages.

        Raises an ``Exception`` if a duplicate file listed in the extension
        is not found at the location in its ``href`` column.
        """

        # modules:
        import os
        import datetime

        # tools:
        from . import cso_file

        # info ...
        logging.info(f"{indent}")
        logging.info(f"{indent}** cleanup mirror extension")
        logging.info(f"{indent}")

        # init base object:
        utopya.UtopyaRc.__init__(self, rcfile=rcfile, rcbase=rcbase, env=env)

        # single reference time, used as default for both file dates:
        now = datetime.datetime.now()

        # table files in main archive:
        listfile1 = self.GetSetting("mirror.file")
        # evaluate time? the documented key is "mirror.filedate";
        # the key "mirror1.filedate" was read by earlier versions
        # and therefore serves as fallback:
        filedate1 = self.GetSetting(
            "mirror.filedate",
            totype="datetime",
            default=self.GetSetting("mirror1.filedate", totype="datetime", default=now),
        )
        listfile1 = filedate1.strftime(listfile1)
        # read:
        listing1 = cso_file.CSO_Listing(listfile1)

        # table files in extra archive:
        listfile2 = self.GetSetting("mirror2.file")
        # evaluate time?
        filedate2 = self.GetSetting("mirror2.filedate", totype="datetime", default=now)
        listfile2 = filedate2.strftime(listfile2)
        # read:
        listing2 = cso_file.CSO_Listing(listfile2)

        # remove duplicate files? if not (default), rename:
        remove_duplicates = self.GetSetting(
            "mirror2.remove_duplicates", totype="bool", default=False
        )

        # info ...
        logging.info(f"{indent}check on duplicate records in extension ...")

        # First collect all duplicates without changing anything on disk;
        # aborting halfway a series of removals would leave the saved listing
        # out of sync with the files, and every rerun would then fail on
        # records of files that are already gone.
        duplicates = []
        nmissing = 0
        # loop over extra files:
        for fname2, row2 in listing2.df.iterrows():
            # only records that are also in main archive:
            if fname2 not in listing1.df.index:
                continue
            # actual file:
            href = row2["href"]
            # check ..
            if os.path.isfile(href):
                duplicates.append((fname2, href))
            else:
                logging.error(f"file not found at expected location: {href}")
                nmissing += 1
            # endif
        # endfor
        # any problems? then stop before files are touched:
        if nmissing > 0:
            raise Exception(
                f"{nmissing} duplicate file(s) of extension not found at expected location"
            )
        # endif

        # now rename or remove the duplicate files:
        for fname2, href in duplicates:
            # info ...
            logging.info(f"{indent}file '{fname2}' already in main archive ...")
            # what to do?
            if remove_duplicates:
                # info ..
                logging.info(f"{indent}  remove: {href}")
                # remove:
                os.remove(href)
            else:
                # info ..
                logging.info(f"{indent}  rename: {href}")
                # rename:
                os.rename(href, href + ".bak")
            # endif
        # endfor

        # labels of records to be removed from listing:
        labels = [fname2 for fname2, _ in duplicates]

        # remove files from listing?
        if not labels:
            # info ..
            logging.info(f"{indent}  no duplicates found ...")
        else:
            # info ..
            logging.info(f"{indent}remove {len(labels)} records from listing ...")
            # remove records:
            listing2.df.drop(labels, inplace=True)
            # directory creation mode:
            dmode = self.GetSetting("dmode", totype="int", default=None)
            # save:
            listing2.Save(listfile2, dmode=dmode, indent=f"{indent}  ")
        # endif

        # info ...
        logging.info(f"{indent}")
        logging.info(f"{indent}** end cleanup")
        logging.info(f"{indent}")

    # enddef __init__


# endclass CSO_ColHubMirror_Cleanup


########################################################################
###
### end