TNO Intern

Commit c4bb6f50 authored by Arjo Segers's avatar Arjo Segers
Browse files

Added blacklist for problematic urls

parent 242a6360
Loading
Loading
Loading
Loading
+34 −7
Original line number Diff line number Diff line
@@ -13,6 +13,9 @@
# 2025-04, Arjo Segers
#   Changed imports for python packaging.
#
# 2025-09, Arjo Segers
#   Added 'blacklist' for problematic URLs.
#

########################################################################
###
@@ -383,6 +386,15 @@ class CSO_EarthAccess_Inquire(utopya.UtopyaRc):
                    # update record:
                    listing.UpdateRecord(filename, data, indent=f"{indent}    ")

                    ## ADHOC check on double records ..
                    #dlst = listing.Select( product=data["product"], start_time=data["start_time"], end_time=data["end_time"], processor_version=data["processor_version"] )
                    #if len(dlst) > 1 :
                    #    logging.error( f"record already exists?" )
                    #    logging.error( dlst.df )
                    #    logging.error( data )
                    #    raise Exception
                    ##endif

                    ## testing ...
                    # if len(listing) >= 100 :
                    #    logging.warning( f"BREAK after {len(listing)} files ..." )
@@ -462,6 +474,13 @@ class CSO_EarthAccess_Download(utopya.UtopyaRc):
        ! processor version "v2.0.0"
         <rcbase>.processor_version   :  020000

    Some urls seem not to exist anymore; more precisely, the same file is available from 2 different urls, of which 1 does not work anymore.
    As a temporary solution, the urls that do not work could be blacklisted::

        ! skip some problematic url's:
        <rcbase>.blacklist            :  https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5200/AERDB_L2_VIIRS_SNPP/2024/103/AERDB_L2_VIIRS_SNPP.A2024103.0836.002.2024106154554.nc \
                                         https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/5200/AERDB_L2_VIIRS_SNPP/2024/103/AERDB_L2_VIIRS_SNPP.A2024103.1348.002.2024106155539.nc

    Specify the directory where the input files are to be searched,
    or where to download them to if not present yet::

@@ -566,8 +585,8 @@ class CSO_EarthAccess_Download(utopya.UtopyaRc):
        logging.info(f"{indent}selection:")
        logging.info(f"{indent}  processor version: {processor_version}")

        ## skip some?
        # blacklist = self.GetSetting("blacklist", default="").split()
        # skip some?
        blacklist = self.GetSetting("blacklist", default="").split()

        # target directory, including time templates:
        arch_dir__template = self.GetSetting("dir")
@@ -594,6 +613,14 @@ class CSO_EarthAccess_Download(utopya.UtopyaRc):
            # info ...
            logging.info(f"{indent}{basename} ...")

            # check ..
            if rec["href"] in blacklist:
                # info ...
                logging.info(f"{indent}  download url is blacklisted, skip ...")
                # next record:
                continue
            #endif

            # expand time templates
            arch_dir = rec["start_time"].strftime(arch_dir__template)
            # full path:
@@ -700,7 +727,7 @@ class CSO_EarthAccess_Download_Listing(utopya.UtopyaRc):
    """
    Create *listing* file for files downloaded from VIIRS data portals.

    A *listing* file contains the names of the converted orbit files,
    A *listing* file contains the names of orbit files,
    the time range of pixels in the file, and other information extracted from the filenames or file attributes::

        filename                                                       ;start_time         ;end_time           ;orbit