Module `LibsvmDataset`

Expand source code

import os
from urllib.request import urlretrieve
from urllib.error import HTTPError
import progressbar
from sklearn.datasets import load_svmlight_file, dump_svmlight_file
import numpy as np
import glob
# import subprocess
# import shlex

class _bcolors:
    """
        Define colors for terminal output texts.
    """
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

class LibsvmDataset:
    def __init__(self, download_dir="./raw", 
                 cleand_dir="./clean"):
        """
        Initialize the  LibsvmDataset class.
        
        Args:
            download_dir: A string specifies the place to store the downloaded raw dataset.
            cleand_dir: A string specifies the place to store the cleaned dataset.
        """
        self.download_dir = download_dir
        self.cleand_dir = cleand_dir
        for directory in [self.download_dir, self.cleand_dir]:
            if not os.path.exists(directory):
                os.makedirs(directory)
        self.url_regression = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression"
        self.url_binary = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary"
        self.data_binary = [
            'a1a', 'a2a', 'a3a', 'a4a', 'a5a', 'a6a', 'a7a', 'a8a', 'a9a',
            'a1a.t', 'a2a.t', 'a3a.t', 'a4a.t', 'a5a.t', 'a6a.t', 'a7a.t', 'a8a.t', 'a9a.t', 
            'australian',
            'breast-cancer',
            'cod-rna', 'cod-rna.t', 'cod-rna.r', 
            'colon-cancer.bz2',
            'covtype.libsvm.binary.bz2',
            'diabetes',
            'duke.bz2',
            'fourclass',
            'german.numer',
            'gisette_scale.bz2', 'gisette_scale.t.bz2',
            'heart',
            'ijcnn1.bz2',
            'ionosphere_scale',
            'leu.bz2', 'leu.bz2.t',
            'liver-disorders', 'liver-disorders.t',
            'mushrooms',
            'phishing',
            'skin_nonskin',
            'splice', 'splice.t',
            'sonar_scale',
            'svmguide1', 'svmguide1.t', 'svmguide3', 'svmguide3.t', 
            'w1a', 'w2a', 'w3a', 'w4a', 'w5a', 'w6a', 'w7a', 'w8a',
            'w1a.t', 'w2a.t', 'w3a.t', 'w4a.t', 'w5a.t', 'w6a.t', 'w7a.t', 'w8a.t'
             #'epsilon_normalized.bz2', 'epsilon_normalized.t.bz2'
             #'HIGGS.bz2',
             #'madelon', 'madelon.t',
             #'news20.binary.bz2',
             #'rcv1_train.binary.bz2','rcv1_test.binary.bz2',
             #'real-sim.bz2',
        ]        
        self.data_regression = [
            'abalone',
            'bodyfat',
            'cadata',
            'cpusmall',
            'log1p.E2006.train.bz2', 'log1p.E2006.test.bz2 '
            'E2006.train.bz2', 'E2006.test.bz2',
            'eunite2001', 'eunite2001.t', 'eunite2001.m',
            'housing',
            'mg',
            'mpg',
            'pyrim',
            'space_ga',
            'triazines',
            'YearPredictionMSD.bz2', 'YearPredictionMSD.t.bz2'
        ]       
        self.task_dict = {"binary":{"url":self.url_binary, 
                                    "dataset":self.data_binary}, 
                          "regression":{"url":self.url_regression, 
                                        "dataset":self.data_regression}}  
        # for printing
        self.pbar = None
        
    def _show_progress(self, block_num, block_size, total_size):
        """
            private function. Show the progress of urlretrieve for downloading data.
        """
        if self.pbar is None:
            self.pbar = progressbar.ProgressBar(maxval=total_size)
            self.pbar.start()

        downloaded = block_num * block_size
        if downloaded < total_size:
            self.pbar.update(downloaded)
        else:
            self.pbar.finish()
            self.pbar = None
    
    def _parseInputs(self, task=None, dataset=None, download_url=None):
        if task is not None and dataset is not None:
            print("You choose to use the task+dataset option.")
            try:
                work_dict = self.task_dict[task]
            except KeyError:
                print(f"{_bcolors.WARNING}Warning:Your input taks is [{task}], which currently is not supported.\n"\
                      f"However, you can provide an url pointing to the desired dataset to download it.{_bcolors.ENDC}")
                return
            is_available = dataset in work_dict["dataset"]
            if not is_available:
                print(f"{_bcolors.FAIL}Error occurs!\n"\
                     f"  1.Either the input dataset:[{dataset}] is not intended for the task:[{task}].\n"\
                     f"  2.Or the input dataset:[{dataset}] is not supported.\n"\
                     f"If you are sure the latter case happens, you can provide an url pointing to the desired dataset.{_bcolors.ENDC}"
                     )
                return
            self.download_url = work_dict["url"] + "/" + dataset
            self.task = task
            self.dataset = dataset
        elif download_url:
            print("You choose to use the url option.")
            try:
                task, dataset = download_url.split("/")[-2], download_url.split("/")[-1]
                self.download_url = download_url
                self.task = task
                self.dataset = dataset
            except IndexError:
                self.download_url = None
                print(f"{_bcolors.FAIL}The input url {download_url} is wrong.{_bcolors.ENDC}")
        else:
            raise ValueError(f"{_bcolors.FAIL}Code has bugs.{_bcolors.ENDC}")
        if self.download_url:
            print(f"Parsed task: [{self.task}] | Parsed dataset: [{self.dataset}]\nParsed download_url:[{self.download_url}]")
    def _getData(self, force_download):
        """
            Use urllib.request::urlretrieve to download the self.dataset and save to 
            self.download_dir/self.task based on the self.download_url.
            If the dataset already exists, then it will skip. However, you can force download
            by setting force_download=True.
        """
        if self.download_url is not None:
            # check whether the dataset is being downloaded or not
            directory = f"{self.download_dir}/{self.task}"
            if not os.path.exists(directory):
                os.makedirs(directory)
            is_downloaded = os.path.exists(f"{directory}/{self.dataset}")
            if not is_downloaded or force_download:
                try:
                    urlretrieve(self.download_url, f'{directory}/{self.dataset}', self._show_progress)
                except HTTPError:
                    print(f"{_bcolors.FAIL}The input or parsed url {self.download_url} is wrong.\n"\
                          f"Possible cause is that either the dataset:[{self.dataset}] or the task:[{self.task}]"\
                          f"is not specified correctly.{_bcolors.ENDC}")
                #print("Start downloading... It may take a while")
                #subprocess.run(['wget', '-i', self.download_url, '-P', self.download_dir, 
                #                '-O', f'{self.download_dir}/{self.dataset}'])
                if os.path.exists(f"{directory}/{self.dataset}"):
                    self.download_success = True
                    print(f"{_bcolors.OKGREEN}dataset [{self.dataset}] is downloaded at [{directory}].{_bcolors.ENDC}")
            else:
                print(f"{_bcolors.WARNING}The dataset [{self.dataset}] already exists in [{directory}]!{_bcolors.ENDC}")
    def _cleanData(self, normalization, binary_label, force_clean, clean_verbose):
        """
            Load the raw dataset from self.download_dir.
            If normalization is set, it will perform appropriate normalization.
            See docstring of the getAndClean to find valid values.
            
            If the cleaned dataset already exists, then it will skip. 
            However, you can force cleaning the dataset by setting force_clean=True.
        """
        directory = f"{self.cleand_dir}/{self.task}"
        if not os.path.exists(directory):
            os.makedirs(directory)
        is_cleaned = os.path.exists(f"{directory}/{self.dataset}")
        if self.download_success:
            if not is_cleaned or force_clean:
                try:
                    data = load_svmlight_file(f"{self.download_dir}/{self.task}/{self.dataset}")
                except ValueError:
                    print(f"{_bcolors.WARNING}Although, you alredy download the data file, the input or parsed url\n"\
                          f"{self.download_url} is wrong. The chance are that you miss the file extension.\n"\
                          f"For exmaple, you should use duke.bz2 instead of the duke for the dataset argument.\n"
                          f"Please check your input dataset:[{self.dataset}].{_bcolors.ENDC}")
                    os.remove(f"{self.download_dir}/{self.task}/{self.dataset}")
                    print(f"{_bcolors.WARNING}Therefore, no cleaning is performed and the file in {self.download_dir}/{self.task}/{self.dataset} is removed!{_bcolors.ENDC}")
                    return
                X, y= data[0], data[1]
                n, p = X.shape
                # check label for the binary task
                if self.task == 'binary':
                    y1old, y2old = np.unique(y)
                    if binary_label is not None:
                        if  binary_label=='{-1,1}':
                            y1new, y2new = -1.0, 1.0
                        elif binary_label=='{0,1}':
                            y1new, y2new = 0.0, 1.0
                        else:
                            raise ValueError(f"Unrecognized binary_level: {binary_label}")
                        y[y==y1old] = y1new
                        y[y==y2old] = y2new
                        print(f"Original y-label range: {{ {y1old}, {y2old} }} -> New y-label range: {{ {np.unique(y)[0]}, {np.unique(y)[1]} }}")
                    else:
                        raise ValueError(f"{_bcolors.FAIL}You should set the desired binary_level.\n For example, binary_label='[-1,1]'.{_bcolors.ENDC}")
                # check feature range
                if normalization is not None:
                    print(f"Perform normalization:{normalization}")
                    if normalization == 'feat-11':
                        for i in range(p):
                            temp = X[:,i]
                            if np.max(temp) > 1.0 or np.min(temp) < -1.0:
                                X[:,i] /= np.max(np.abs(temp))
                                if clean_verbose:
                                    print(f"  col:{i}: max:{np.max(temp):3.3e} | min:{np.min(temp):3.3e}\n"\
                                           "  Apply feature-wise [-1,1] scaling...")
                    elif normalization == 'feat01':
                        for i in range(p):
                            temp = X[:,i]
                            xmax, xmin = np.max(temp), np.min(temp)
                            if xmax > 1.0 or  xmin < 0.0:
                                X[:,i] = (X[:,i] - xmin) / (xmax - xmin)
                                if clean_verbose:
                                    print(f"  col:{i}: max:{np.max(temp):3.3e} | min:{np.min(temp):3.3e}\n"\
                                           "  Apply feature-wise [0,1] scaling...")
                    else:
                        raise ValueError(f"{_bcolors.FAIL}Unrecognized normalization: {normalization}{_bcolors.ENDC}")
                dump_svmlight_file(X, y, f"{directory}/{self.dataset}")
                if os.path.exists(f"{directory}/{self.dataset}"):
                    self.clean_success = True
                    print(f"{_bcolors.OKGREEN}Success: File saved at [{directory}]!{_bcolors.ENDC}")
                    print("-*"*30)
            else:
                 print(f"{_bcolors.WARNING}The cleaned dataset [{self.dataset}] already exists in [{directory}]!{_bcolors.ENDC}")
        else:
            if not is_cleaned:
                print(f"{_bcolors.WARNING}The dataset [{self.dataset}] does not exist in [{self.download_dir}/{self.task}]! No cleaning is performed.{_bcolors.ENDC}")
    def getAvailable(self):
            """Show supported tasks and for each supported task show available datasets.
            Typical usage example:
            
                   libsvm = LibsvmDataset()
                   libsvm.getAvailable()
            """
            print("Current supported tasks are:")
            for k in self.task_dict.keys():
                print(f" ['{k}']", end="")
            print("\n=====================================")
            for k in self.task_dict.keys():
                print(f"For task:['{k}'], available datasets are:")
                print("----------------------------------------------------")
                dataset_lst = []
                for i,d in enumerate(self.task_dict[k]["dataset"]):
                    print(f" '{d}'", end=",")
                    if (i +1) % 5 == 0:
                        print("\n")
                print("\n")        
            print("\n")
    def getAndClean(self, task=None, dataset=None, download_url=None, 
                    binary_lable='{-1,1}', normalization='feat-11',
                    force_download=False, force_clean=False, clean_verbose=True
                   ):
        """Download and clean the dataset.
        Typical usage example:

              libsvm = LibsvmDataset()
              # usage 1
              libsvm.getAndClean(task="binary", dataset="a1a", binary_lable='{-1,1}', normalization='feat-11')
              # usage 2
              libsvm.getAndClean(task="regression", dataset="abalone", normalization='feat-11')
              # usage 3
              libsvm.getAndClean(url='https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.bz2',
                                 binary_lable='{-1,1}', normalization='feat-11')    
        Warning:
            To get the correct dataset name, one can either use `getAvailable` method to see all available datasets.
            Or visit https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/ and get the name of desired datasets.
            For example, given the download link: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.bz2
            the dataset name is avazu-app.bz2 (please keep the .bz2 extension).
        
        Args:

            task: A string specifies the task you wish to perform. Currently supported {'binary', 'regression'}.
            dataset: A string specifies the dataset you want to download. Use `getAvailable` method to show all
                     currently available datasets for any given task.
            download_url: If the desired dataset is not provided for a given task, one can directly provide a url 
                          link to the desired data set. For example, one wants to download the avazu dataset.
                          One can visit https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#avazu.
                          If you want to download the "avazu-app.bz2" instance, you can right-click its name and 
                          select "copy link", then you should get a plain text as
                              <https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.bz2>
                          Then just provides it as a string.
            binary_label: If you want to perform binary classification, one can set labels to {-1,1} by providing
                          '{-1,1}'; or set labels to {0,1} by providing '{0,1}'. The default is '{-1,1}'.
            normalization: Perform feature-wise normalization. Currently supported options:
                             'feat-11': feature-wise scaling to range [-1,1].
                             'feat01': feature-wise scaling to range [0,1].
                           The default is `feat-11`.
            force_download: If set to True, then download the dataset even if it already exists. Default to False.
            force_clean:    If set to True, then clean the dataset even if it already exists in clean folder. Default to False.
            clean_verbose:  If set to True, will print out which feature being normalized. Default to False.
        """
        # reset
        self.download_url = None
        self.task = None
        self.dataset = None 
        self.download_success = False
        self.clean_success = False
        self._parseInputs(task, dataset, download_url)
        if self.download_url is not None:
            self._getData(force_download)
            if self.task in ["binary", "regression"]:
                self._cleanData(normalization, binary_lable, force_clean, clean_verbose)
            else:
                print(f"{_bcolors.WARNING}The clean rule for dataset:{self.dataset} with task:{self.task} is not defined. Hence, no cleaning is performed.{_bcolors.ENDC}")
        else:
            print(f"{_bcolors.WARNING}Fail to generate download url, please check your inputs.{_bcolors.ENDC}")
    def getAndCleanFromFile(self, file_path, task=None, binary_lable='{-1,1}', normalization='feat-11', 
                            force_download=False, force_clean=False, clean_verbose=True):
        """Download all datasets specified in the text file.
        
        Args:
            file_path: A string points to a text file that store the datasets name, where a single dataset name 
                       takes a row. See "libsvm_regression.txt" for example. You can also use "#" to comment the 
                       dataset name to prevent downloading the dataset. The rest arguments please refer to the 
                       `getAndClean` method.
             
        
        Warning: You should be responsible to make sure all datasets specified in the text file match with the 
        `task`. Although some sanity checks are performed, there are still many ways to break the code.
        When the code breaks (i.e., Errors are not caught by my code), chances are high that the dataset name is 
        not specify correctly. For example, you want to work with the "duke" dataset. You should put "duke.bz2" in 
        the text file. This is because the current implementation use the dataset name to generate the download url. 
       
        
        Typical usage example:
        
            file_path = './libsvm_regression.txt'
            libclass.getAndCleanFromFile(file_path,
                                         task='regression', 
                                         normalization='feat-11',
                                         force_download=False, 
                                         force_clean=False, 
                                         clean_verbose=False)
        """
        # parse file
        dataset_lst = []
        with open(file_path, 'r') as f:
            for line in f.readlines():
                temp = line.split("#")
                if len(temp) ==  1:
                    dataset = temp[0].strip()
                    dataset_lst.append(dataset)
        print(f"Parsed {len(dataset_lst)} datasets from {file_path}.")
        download_count = 0
        clean_count = 0
        url = self.task_dict[task]["url"]
        for dataset in dataset_lst:
            self.download_url = url + "/" + dataset
            self.task = task
            self.dataset = dataset
            self.download_success = False
            self.clean_success = False
            self._getData(force_download)
            if self.task in ["binary", "regression"]:
                self._cleanData(normalization, binary_lable, force_clean, clean_verbose)
            else:
                print(f"{_bcolors.WARNING}The clean rule for dataset:{self.dataset} with task:{self.task} is not defined. Hence, no cleaning is performed.{_bcolors.ENDC}")
            if self.download_success:
                download_count += 1
            if self.clean_success:
                clean_count += 1
        msg = "Summary:\n"
        if download_count == len(dataset_lst) and clean_count == len(dataset_lst):
            msg += f"{_bcolors.OKGREEN}Plan to download: [{len(dataset_lst)}] datasets|"\
                   f" Successfully download: [{download_count}] datasets | Successfully clean: [{clean_count}] datasets.{_bcolors.ENDC}"
        else:
            msg += f"{_bcolors.WARNING}Plan to download: {len(dataset_lst)} datasets| "\
                   f"Successfully download: [{download_count}] datasets | Successfully clean: [{clean_count}] datasets.{_bcolors.ENDC}\n"
            msg += "Potential cause is that some datasets are already downloaded and/or cleaned.\n"
            number_of_files_download = glob.glob(f"{self.download_dir}/{task}/*")
            number_of_files_clean = glob.glob(f"{self.cleand_dir}/{task}/*")
            msg += f"[{self.download_dir}/{task}] has [{len(number_of_files_download)}] files and [{self.cleand_dir}/{task}] has [{len(number_of_files_clean)}] files."
        print(msg)

Classes

class LibsvmDataset (download_dir='./raw', cleand_dir='./clean')

Initialize the LibsvmDataset class.

Args

download_dir: A string specifies the place to store the downloaded raw dataset.
cleand_dir: A string specifies the place to store the cleaned dataset.

Expand source code

class LibsvmDataset:
    def __init__(self, download_dir="./raw", 
                 cleand_dir="./clean"):
        """
        Initialize the  LibsvmDataset class.
        
        Args:
            download_dir: A string specifies the place to store the downloaded raw dataset.
            cleand_dir: A string specifies the place to store the cleaned dataset.
        """
        self.download_dir = download_dir
        self.cleand_dir = cleand_dir
        for directory in [self.download_dir, self.cleand_dir]:
            if not os.path.exists(directory):
                os.makedirs(directory)
        self.url_regression = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression"
        self.url_binary = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary"
        self.data_binary = [
            'a1a', 'a2a', 'a3a', 'a4a', 'a5a', 'a6a', 'a7a', 'a8a', 'a9a',
            'a1a.t', 'a2a.t', 'a3a.t', 'a4a.t', 'a5a.t', 'a6a.t', 'a7a.t', 'a8a.t', 'a9a.t', 
            'australian',
            'breast-cancer',
            'cod-rna', 'cod-rna.t', 'cod-rna.r', 
            'colon-cancer.bz2',
            'covtype.libsvm.binary.bz2',
            'diabetes',
            'duke.bz2',
            'fourclass',
            'german.numer',
            'gisette_scale.bz2', 'gisette_scale.t.bz2',
            'heart',
            'ijcnn1.bz2',
            'ionosphere_scale',
            'leu.bz2', 'leu.bz2.t',
            'liver-disorders', 'liver-disorders.t',
            'mushrooms',
            'phishing',
            'skin_nonskin',
            'splice', 'splice.t',
            'sonar_scale',
            'svmguide1', 'svmguide1.t', 'svmguide3', 'svmguide3.t', 
            'w1a', 'w2a', 'w3a', 'w4a', 'w5a', 'w6a', 'w7a', 'w8a',
            'w1a.t', 'w2a.t', 'w3a.t', 'w4a.t', 'w5a.t', 'w6a.t', 'w7a.t', 'w8a.t'
             #'epsilon_normalized.bz2', 'epsilon_normalized.t.bz2'
             #'HIGGS.bz2',
             #'madelon', 'madelon.t',
             #'news20.binary.bz2',
             #'rcv1_train.binary.bz2','rcv1_test.binary.bz2',
             #'real-sim.bz2',
        ]        
        self.data_regression = [
            'abalone',
            'bodyfat',
            'cadata',
            'cpusmall',
            'log1p.E2006.train.bz2', 'log1p.E2006.test.bz2 '
            'E2006.train.bz2', 'E2006.test.bz2',
            'eunite2001', 'eunite2001.t', 'eunite2001.m',
            'housing',
            'mg',
            'mpg',
            'pyrim',
            'space_ga',
            'triazines',
            'YearPredictionMSD.bz2', 'YearPredictionMSD.t.bz2'
        ]       
        self.task_dict = {"binary":{"url":self.url_binary, 
                                    "dataset":self.data_binary}, 
                          "regression":{"url":self.url_regression, 
                                        "dataset":self.data_regression}}  
        # for printing
        self.pbar = None
        
    def _show_progress(self, block_num, block_size, total_size):
        """
            private function. Show the progress of urlretrieve for downloading data.
        """
        if self.pbar is None:
            self.pbar = progressbar.ProgressBar(maxval=total_size)
            self.pbar.start()

        downloaded = block_num * block_size
        if downloaded < total_size:
            self.pbar.update(downloaded)
        else:
            self.pbar.finish()
            self.pbar = None
    
    def _parseInputs(self, task=None, dataset=None, download_url=None):
        if task is not None and dataset is not None:
            print("You choose to use the task+dataset option.")
            try:
                work_dict = self.task_dict[task]
            except KeyError:
                print(f"{_bcolors.WARNING}Warning:Your input taks is [{task}], which currently is not supported.\n"\
                      f"However, you can provide an url pointing to the desired dataset to download it.{_bcolors.ENDC}")
                return
            is_available = dataset in work_dict["dataset"]
            if not is_available:
                print(f"{_bcolors.FAIL}Error occurs!\n"\
                     f"  1.Either the input dataset:[{dataset}] is not intended for the task:[{task}].\n"\
                     f"  2.Or the input dataset:[{dataset}] is not supported.\n"\
                     f"If you are sure the latter case happens, you can provide an url pointing to the desired dataset.{_bcolors.ENDC}"
                     )
                return
            self.download_url = work_dict["url"] + "/" + dataset
            self.task = task
            self.dataset = dataset
        elif download_url:
            print("You choose to use the url option.")
            try:
                task, dataset = download_url.split("/")[-2], download_url.split("/")[-1]
                self.download_url = download_url
                self.task = task
                self.dataset = dataset
            except IndexError:
                self.download_url = None
                print(f"{_bcolors.FAIL}The input url {download_url} is wrong.{_bcolors.ENDC}")
        else:
            raise ValueError(f"{_bcolors.FAIL}Code has bugs.{_bcolors.ENDC}")
        if self.download_url:
            print(f"Parsed task: [{self.task}] | Parsed dataset: [{self.dataset}]\nParsed download_url:[{self.download_url}]")
    def _getData(self, force_download):
        """
            Use urllib.request::urlretrieve to download the self.dataset and save to 
            self.download_dir/self.task based on the self.download_url.
            If the dataset already exists, then it will skip. However, you can force download
            by setting force_download=True.
        """
        if self.download_url is not None:
            # check whether the dataset is being downloaded or not
            directory = f"{self.download_dir}/{self.task}"
            if not os.path.exists(directory):
                os.makedirs(directory)
            is_downloaded = os.path.exists(f"{directory}/{self.dataset}")
            if not is_downloaded or force_download:
                try:
                    urlretrieve(self.download_url, f'{directory}/{self.dataset}', self._show_progress)
                except HTTPError:
                    print(f"{_bcolors.FAIL}The input or parsed url {self.download_url} is wrong.\n"\
                          f"Possible cause is that either the dataset:[{self.dataset}] or the task:[{self.task}]"\
                          f"is not specified correctly.{_bcolors.ENDC}")
                #print("Start downloading... It may take a while")
                #subprocess.run(['wget', '-i', self.download_url, '-P', self.download_dir, 
                #                '-O', f'{self.download_dir}/{self.dataset}'])
                if os.path.exists(f"{directory}/{self.dataset}"):
                    self.download_success = True
                    print(f"{_bcolors.OKGREEN}dataset [{self.dataset}] is downloaded at [{directory}].{_bcolors.ENDC}")
            else:
                print(f"{_bcolors.WARNING}The dataset [{self.dataset}] already exists in [{directory}]!{_bcolors.ENDC}")
    def _cleanData(self, normalization, binary_label, force_clean, clean_verbose):
        """
            Load the raw dataset from self.download_dir.
            If normalization is set, it will perform appropriate normalization.
            See docstring of the getAndClean to find valid values.
            
            If the cleaned dataset already exists, then it will skip. 
            However, you can force cleaning the dataset by setting force_clean=True.
        """
        directory = f"{self.cleand_dir}/{self.task}"
        if not os.path.exists(directory):
            os.makedirs(directory)
        is_cleaned = os.path.exists(f"{directory}/{self.dataset}")
        if self.download_success:
            if not is_cleaned or force_clean:
                try:
                    data = load_svmlight_file(f"{self.download_dir}/{self.task}/{self.dataset}")
                except ValueError:
                    print(f"{_bcolors.WARNING}Although, you alredy download the data file, the input or parsed url\n"\
                          f"{self.download_url} is wrong. The chance are that you miss the file extension.\n"\
                          f"For exmaple, you should use duke.bz2 instead of the duke for the dataset argument.\n"
                          f"Please check your input dataset:[{self.dataset}].{_bcolors.ENDC}")
                    os.remove(f"{self.download_dir}/{self.task}/{self.dataset}")
                    print(f"{_bcolors.WARNING}Therefore, no cleaning is performed and the file in {self.download_dir}/{self.task}/{self.dataset} is removed!{_bcolors.ENDC}")
                    return
                X, y= data[0], data[1]
                n, p = X.shape
                # check label for the binary task
                if self.task == 'binary':
                    y1old, y2old = np.unique(y)
                    if binary_label is not None:
                        if  binary_label=='{-1,1}':
                            y1new, y2new = -1.0, 1.0
                        elif binary_label=='{0,1}':
                            y1new, y2new = 0.0, 1.0
                        else:
                            raise ValueError(f"Unrecognized binary_level: {binary_label}")
                        y[y==y1old] = y1new
                        y[y==y2old] = y2new
                        print(f"Original y-label range: {{ {y1old}, {y2old} }} -> New y-label range: {{ {np.unique(y)[0]}, {np.unique(y)[1]} }}")
                    else:
                        raise ValueError(f"{_bcolors.FAIL}You should set the desired binary_level.\n For example, binary_label='[-1,1]'.{_bcolors.ENDC}")
                # check feature range
                if normalization is not None:
                    print(f"Perform normalization:{normalization}")
                    if normalization == 'feat-11':
                        for i in range(p):
                            temp = X[:,i]
                            if np.max(temp) > 1.0 or np.min(temp) < -1.0:
                                X[:,i] /= np.max(np.abs(temp))
                                if clean_verbose:
                                    print(f"  col:{i}: max:{np.max(temp):3.3e} | min:{np.min(temp):3.3e}\n"\
                                           "  Apply feature-wise [-1,1] scaling...")
                    elif normalization == 'feat01':
                        for i in range(p):
                            temp = X[:,i]
                            xmax, xmin = np.max(temp), np.min(temp)
                            if xmax > 1.0 or  xmin < 0.0:
                                X[:,i] = (X[:,i] - xmin) / (xmax - xmin)
                                if clean_verbose:
                                    print(f"  col:{i}: max:{np.max(temp):3.3e} | min:{np.min(temp):3.3e}\n"\
                                           "  Apply feature-wise [0,1] scaling...")
                    else:
                        raise ValueError(f"{_bcolors.FAIL}Unrecognized normalization: {normalization}{_bcolors.ENDC}")
                dump_svmlight_file(X, y, f"{directory}/{self.dataset}")
                if os.path.exists(f"{directory}/{self.dataset}"):
                    self.clean_success = True
                    print(f"{_bcolors.OKGREEN}Success: File saved at [{directory}]!{_bcolors.ENDC}")
                    print("-*"*30)
            else:
                 print(f"{_bcolors.WARNING}The cleaned dataset [{self.dataset}] already exists in [{directory}]!{_bcolors.ENDC}")
        else:
            if not is_cleaned:
                print(f"{_bcolors.WARNING}The dataset [{self.dataset}] does not exist in [{self.download_dir}/{self.task}]! No cleaning is performed.{_bcolors.ENDC}")
    def getAvailable(self):
            """Show supported tasks and for each supported task show available datasets.
            Typical usage example:
            
                   libsvm = LibsvmDataset()
                   libsvm.getAvailable()
            """
            print("Current supported tasks are:")
            for k in self.task_dict.keys():
                print(f" ['{k}']", end="")
            print("\n=====================================")
            for k in self.task_dict.keys():
                print(f"For task:['{k}'], available datasets are:")
                print("----------------------------------------------------")
                dataset_lst = []
                for i,d in enumerate(self.task_dict[k]["dataset"]):
                    print(f" '{d}'", end=",")
                    if (i +1) % 5 == 0:
                        print("\n")
                print("\n")        
            print("\n")
    def getAndClean(self, task=None, dataset=None, download_url=None, 
                    binary_lable='{-1,1}', normalization='feat-11',
                    force_download=False, force_clean=False, clean_verbose=True
                   ):
        """Download and clean the dataset.
        Typical usage example:

              libsvm = LibsvmDataset()
              # usage 1
              libsvm.getAndClean(task="binary", dataset="a1a", binary_lable='{-1,1}', normalization='feat-11')
              # usage 2
              libsvm.getAndClean(task="regression", dataset="abalone", normalization='feat-11')
              # usage 3
              libsvm.getAndClean(url='https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.bz2',
                                 binary_lable='{-1,1}', normalization='feat-11')    
        Warning:
            To get the correct dataset name, one can either use `getAvailable` method to see all available datasets.
            Or visit https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/ and get the name of desired datasets.
            For example, given the download link: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.bz2
            the dataset name is avazu-app.bz2 (please keep the .bz2 extension).
        
        Args:

            task: A string specifies the task you wish to perform. Currently supported {'binary', 'regression'}.
            dataset: A string specifies the dataset you want to download. Use `getAvailable` method to show all
                     currently available datasets for any given task.
            download_url: If the desired dataset is not provided for a given task, one can directly provide a url 
                          link to the desired data set. For example, one wants to download the avazu dataset.
                          One can visit https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#avazu.
                          If you want to download the "avazu-app.bz2" instance, you can right-click its name and 
                          select "copy link", then you should get a plain text as
                              <https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.bz2>
                          Then just provides it as a string.
            binary_label: If you want to perform binary classification, one can set labels to {-1,1} by providing
                          '{-1,1}'; or set labels to {0,1} by providing '{0,1}'. The default is '{-1,1}'.
            normalization: Perform feature-wise normalization. Currently supported options:
                             'feat-11': feature-wise scaling to range [-1,1].
                             'feat01': feature-wise scaling to range [0,1].
                           The default is `feat-11`.
            force_download: If set to True, then download the dataset even if it already exists. Default to False.
            force_clean:    If set to True, then clean the dataset even if it already exists in clean folder. Default to False.
            clean_verbose:  If set to True, will print out which feature being normalized. Default to False.
        """
        # reset
        self.download_url = None
        self.task = None
        self.dataset = None 
        self.download_success = False
        self.clean_success = False
        self._parseInputs(task, dataset, download_url)
        if self.download_url is not None:
            self._getData(force_download)
            if self.task in ["binary", "regression"]:
                self._cleanData(normalization, binary_lable, force_clean, clean_verbose)
            else:
                print(f"{_bcolors.WARNING}The clean rule for dataset:{self.dataset} with task:{self.task} is not defined. Hence, no cleaning is performed.{_bcolors.ENDC}")
        else:
            print(f"{_bcolors.WARNING}Fail to generate download url, please check your inputs.{_bcolors.ENDC}")
    def getAndCleanFromFile(self, file_path, task=None, binary_lable='{-1,1}', normalization='feat-11', 
                            force_download=False, force_clean=False, clean_verbose=True):
        """Download all datasets specified in the text file.
        
        Args:
            file_path: A string points to a text file that store the datasets name, where a single dataset name 
                       takes a row. See "libsvm_regression.txt" for example. You can also use "#" to comment the 
                       dataset name to prevent downloading the dataset. The rest arguments please refer to the 
                       `getAndClean` method.
             
        
        Warning: You should be responsible to make sure all datasets specified in the text file match with the 
        `task`. Although some sanity checks are performed, there are still many ways to break the code.
        When the code breaks (i.e., Errors are not caught by my code), chances are high that the dataset name is 
        not specify correctly. For example, you want to work with the "duke" dataset. You should put "duke.bz2" in 
        the text file. This is because the current implementation use the dataset name to generate the download url. 
       
        
        Typical usage example:
        
            file_path = './libsvm_regression.txt'
            libclass.getAndCleanFromFile(file_path,
                                         task='regression', 
                                         normalization='feat-11',
                                         force_download=False, 
                                         force_clean=False, 
                                         clean_verbose=False)
        """
        # parse file
        dataset_lst = []
        with open(file_path, 'r') as f:
            for line in f.readlines():
                temp = line.split("#")
                if len(temp) ==  1:
                    dataset = temp[0].strip()
                    dataset_lst.append(dataset)
        print(f"Parsed {len(dataset_lst)} datasets from {file_path}.")
        download_count = 0
        clean_count = 0
        url = self.task_dict[task]["url"]
        for dataset in dataset_lst:
            self.download_url = url + "/" + dataset
            self.task = task
            self.dataset = dataset
            self.download_success = False
            self.clean_success = False
            self._getData(force_download)
            if self.task in ["binary", "regression"]:
                self._cleanData(normalization, binary_lable, force_clean, clean_verbose)
            else:
                print(f"{_bcolors.WARNING}The clean rule for dataset:{self.dataset} with task:{self.task} is not defined. Hence, no cleaning is performed.{_bcolors.ENDC}")
            if self.download_success:
                download_count += 1
            if self.clean_success:
                clean_count += 1
        msg = "Summary:\n"
        if download_count == len(dataset_lst) and clean_count == len(dataset_lst):
            msg += f"{_bcolors.OKGREEN}Plan to download: [{len(dataset_lst)}] datasets|"\
                   f" Successfully download: [{download_count}] datasets | Successfully clean: [{clean_count}] datasets.{_bcolors.ENDC}"
        else:
            msg += f"{_bcolors.WARNING}Plan to download: {len(dataset_lst)} datasets| "\
                   f"Successfully download: [{download_count}] datasets | Successfully clean: [{clean_count}] datasets.{_bcolors.ENDC}\n"
            msg += "Potential cause is that some datasets are already downloaded and/or cleaned.\n"
            number_of_files_download = glob.glob(f"{self.download_dir}/{task}/*")
            number_of_files_clean = glob.glob(f"{self.cleand_dir}/{task}/*")
            msg += f"[{self.download_dir}/{task}] has [{len(number_of_files_download)}] files and [{self.cleand_dir}/{task}] has [{len(number_of_files_clean)}] files."
        print(msg)

Methods

def getAndClean(self, task=None, dataset=None, download_url=None, binary_lable='{-1,1}', normalization='feat-11', force_download=False, force_clean=False, clean_verbose=True)

Download and clean the dataset. Typical usage example:

  libsvm = LibsvmDataset()
  # usage 1
  libsvm.getAndClean(task="binary", dataset="a1a", binary_lable='{-1,1}', normalization='feat-11')
  # usage 2
  libsvm.getAndClean(task="regression", dataset="abalone", normalization='feat-11')
  # usage 3
  libsvm.getAndClean(url='https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.bz2',
                     binary_lable='{-1,1}', normalization='feat-11')

Warning

To get the correct dataset name, one can either use getAvailable method to see all available datasets. Or visit https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/ and get the name of desired datasets. For example, given the download link: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.bz2 the dataset name is avazu-app.bz2 (please keep the .bz2 extension).

Args

task: A string specifies the task you wish to perform. Currently supported {'binary', 'regression'}.
dataset: A string specifies the dataset you want to download. Use getAvailable method to show all currently available datasets for any given task.
download_url: If the desired dataset is not provided for a given task, one can directly provide a url link to the desired data set. For example, one wants to download the avazu dataset. One can visit https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#avazu. If you want to download the "avazu-app.bz2" instance, you can right-click its name and select "copy link", then you should get a plain text as https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.bz2 Then just provides it as a string.
binary_label: If you want to perform binary classification, one can set labels to {-1,1} by providing '{-1,1}'; or set labels to {0,1} by providing '{0,1}'. The default is '{-1,1}'.
normalization: Perform feature-wise normalization. Currently supported options: 'feat-11': feature-wise scaling to range [-1,1]. 'feat01': feature-wise scaling to range [0,1]. The default is feat-11.
force_download: If set to True, then download the dataset even if it already exists. Default to False.
force_clean: If set to True, then clean the dataset even if it already exists in clean folder. Default to False.
clean_verbose: If set to True, will print out which feature being normalized. Default to False.

Expand source code

def getAndClean(self, task=None, dataset=None, download_url=None, 
                binary_lable='{-1,1}', normalization='feat-11',
                force_download=False, force_clean=False, clean_verbose=True
               ):
    """Download and clean the dataset.
    Typical usage example:

          libsvm = LibsvmDataset()
          # usage 1
          libsvm.getAndClean(task="binary", dataset="a1a", binary_lable='{-1,1}', normalization='feat-11')
          # usage 2
          libsvm.getAndClean(task="regression", dataset="abalone", normalization='feat-11')
          # usage 3
          libsvm.getAndClean(url='https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.bz2',
                             binary_lable='{-1,1}', normalization='feat-11')    
    Warning:
        To get the correct dataset name, one can either use `getAvailable` method to see all available datasets.
        Or visit https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/ and get the name of desired datasets.
        For example, given the download link: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.bz2
        the dataset name is avazu-app.bz2 (please keep the .bz2 extension).
    
    Args:

        task: A string specifies the task you wish to perform. Currently supported {'binary', 'regression'}.
        dataset: A string specifies the dataset you want to download. Use `getAvailable` method to show all
                 currently available datasets for any given task.
        download_url: If the desired dataset is not provided for a given task, one can directly provide a url 
                      link to the desired data set. For example, one wants to download the avazu dataset.
                      One can visit https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html#avazu.
                      If you want to download the "avazu-app.bz2" instance, you can right-click its name and 
                      select "copy link", then you should get a plain text as
                          <https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.bz2>
                      Then just provides it as a string.
        binary_label: If you want to perform binary classification, one can set labels to {-1,1} by providing
                      '{-1,1}'; or set labels to {0,1} by providing '{0,1}'. The default is '{-1,1}'.
        normalization: Perform feature-wise normalization. Currently supported options:
                         'feat-11': feature-wise scaling to range [-1,1].
                         'feat01': feature-wise scaling to range [0,1].
                       The default is `feat-11`.
        force_download: If set to True, then download the dataset even if it already exists. Default to False.
        force_clean:    If set to True, then clean the dataset even if it already exists in clean folder. Default to False.
        clean_verbose:  If set to True, will print out which feature being normalized. Default to False.
    """
    # reset
    self.download_url = None
    self.task = None
    self.dataset = None 
    self.download_success = False
    self.clean_success = False
    self._parseInputs(task, dataset, download_url)
    if self.download_url is not None:
        self._getData(force_download)
        if self.task in ["binary", "regression"]:
            self._cleanData(normalization, binary_lable, force_clean, clean_verbose)
        else:
            print(f"{_bcolors.WARNING}The clean rule for dataset:{self.dataset} with task:{self.task} is not defined. Hence, no cleaning is performed.{_bcolors.ENDC}")
    else:
        print(f"{_bcolors.WARNING}Fail to generate download url, please check your inputs.{_bcolors.ENDC}")

def getAndCleanFromFile(self, file_path, task=None, binary_lable='{-1,1}', normalization='feat-11', force_download=False, force_clean=False, clean_verbose=True)

Download all datasets specified in the text file.

Args

file_path: A string points to a text file that store the datasets name, where a single dataset name takes a row. See "libsvm_regression.txt" for example. You can also use "#" to comment the dataset name to prevent downloading the dataset. The rest arguments please refer to the getAndClean method.

Warning: You should be responsible to make sure all datasets specified in the text file match with the task. Although some sanity checks are performed, there are still many ways to break the code. When the code breaks (i.e., Errors are not caught by my code), chances are high that the dataset name is not specify correctly. For example, you want to work with the "duke" dataset. You should put "duke.bz2" in the text file. This is because the current implementation use the dataset name to generate the download url.

Typical usage example:

file_path = './libsvm_regression.txt'
libclass.getAndCleanFromFile(file_path,
                             task='regression', 
                             normalization='feat-11',
                             force_download=False, 
                             force_clean=False, 
                             clean_verbose=False)

Expand source code

def getAndCleanFromFile(self, file_path, task=None, binary_lable='{-1,1}', normalization='feat-11', 
                        force_download=False, force_clean=False, clean_verbose=True):
    """Download all datasets specified in the text file.
    
    Args:
        file_path: A string points to a text file that store the datasets name, where a single dataset name 
                   takes a row. See "libsvm_regression.txt" for example. You can also use "#" to comment the 
                   dataset name to prevent downloading the dataset. The rest arguments please refer to the 
                   `getAndClean` method.
         
    
    Warning: You should be responsible to make sure all datasets specified in the text file match with the 
    `task`. Although some sanity checks are performed, there are still many ways to break the code.
    When the code breaks (i.e., Errors are not caught by my code), chances are high that the dataset name is 
    not specify correctly. For example, you want to work with the "duke" dataset. You should put "duke.bz2" in 
    the text file. This is because the current implementation use the dataset name to generate the download url. 
   
    
    Typical usage example:
    
        file_path = './libsvm_regression.txt'
        libclass.getAndCleanFromFile(file_path,
                                     task='regression', 
                                     normalization='feat-11',
                                     force_download=False, 
                                     force_clean=False, 
                                     clean_verbose=False)
    """
    # parse file
    dataset_lst = []
    with open(file_path, 'r') as f:
        for line in f.readlines():
            temp = line.split("#")
            if len(temp) ==  1:
                dataset = temp[0].strip()
                dataset_lst.append(dataset)
    print(f"Parsed {len(dataset_lst)} datasets from {file_path}.")
    download_count = 0
    clean_count = 0
    url = self.task_dict[task]["url"]
    for dataset in dataset_lst:
        self.download_url = url + "/" + dataset
        self.task = task
        self.dataset = dataset
        self.download_success = False
        self.clean_success = False
        self._getData(force_download)
        if self.task in ["binary", "regression"]:
            self._cleanData(normalization, binary_lable, force_clean, clean_verbose)
        else:
            print(f"{_bcolors.WARNING}The clean rule for dataset:{self.dataset} with task:{self.task} is not defined. Hence, no cleaning is performed.{_bcolors.ENDC}")
        if self.download_success:
            download_count += 1
        if self.clean_success:
            clean_count += 1
    msg = "Summary:\n"
    if download_count == len(dataset_lst) and clean_count == len(dataset_lst):
        msg += f"{_bcolors.OKGREEN}Plan to download: [{len(dataset_lst)}] datasets|"\
               f" Successfully download: [{download_count}] datasets | Successfully clean: [{clean_count}] datasets.{_bcolors.ENDC}"
    else:
        msg += f"{_bcolors.WARNING}Plan to download: {len(dataset_lst)} datasets| "\
               f"Successfully download: [{download_count}] datasets | Successfully clean: [{clean_count}] datasets.{_bcolors.ENDC}\n"
        msg += "Potential cause is that some datasets are already downloaded and/or cleaned.\n"
        number_of_files_download = glob.glob(f"{self.download_dir}/{task}/*")
        number_of_files_clean = glob.glob(f"{self.cleand_dir}/{task}/*")
        msg += f"[{self.download_dir}/{task}] has [{len(number_of_files_download)}] files and [{self.cleand_dir}/{task}] has [{len(number_of_files_clean)}] files."
    print(msg)

def getAvailable(self)

Show supported tasks and for each supported task show available datasets. Typical usage example:

   libsvm = LibsvmDataset()
   libsvm.getAvailable()

Expand source code

def getAvailable(self):
        """Show supported tasks and for each supported task show available datasets.
        Typical usage example:
        
               libsvm = LibsvmDataset()
               libsvm.getAvailable()
        """
        print("Current supported tasks are:")
        for k in self.task_dict.keys():
            print(f" ['{k}']", end="")
        print("\n=====================================")
        for k in self.task_dict.keys():
            print(f"For task:['{k}'], available datasets are:")
            print("----------------------------------------------------")
            dataset_lst = []
            for i,d in enumerate(self.task_dict[k]["dataset"]):
                print(f" '{d}'", end=",")
                if (i +1) % 5 == 0:
                    print("\n")
            print("\n")        
        print("\n")