# Source code for soundata.datasets.esc50

"""ESC-50 Dataset Loader

.. admonition:: Dataset Info
    :class: dropdown

    **ESC-50: Dataset for Environmental Sound Classification**

    The ESC-50 dataset is a labeled collection of 2000 environmental audio recordings suitable for benchmarking methods of environmental sound classification.
    The total duration of the dataset is 2.8 hours (2000 x 5 seconds).

    The dataset consists of 5-second-long recordings organized into 50 semantic classes (with 40 examples per class) loosely arranged into 5 major categories:

    Animals	        Natural soundscapes & water sounds	    Human, non-speech sounds	Interior/domestic sounds	Exterior/urban noises
    Dog	            Rain	                                Crying baby	                Door knock	                Helicopter
    Rooster	        Sea waves	                            Sneezing	                Mouse click             	Chainsaw
    Pig	            Crackling fire	                        Clapping                	Keyboard typing	            Siren
    Cow	            Crickets	                            Breathing	                Door, wood creaks	        Car horn
    Frog	        Chirping birds	                        Coughing	                Can opening             	Engine
    Cat     	    Water drops                     	    Footsteps               	Washing machine	            Train
    Hen     	    Wind                                	Laughing	                Vacuum cleaner	            Church bells
    Insects (flying)	Pouring water	                        Brushing teeth	            Clock alarm             	Airplane
    Sheep	        Toilet flush                        	Snoring	                    Clock tick              	Fireworks
    Crow	        Thunderstorm	                        Drinking, sipping	        Glass breaking	            Hand saw
    
    Clips in this dataset have been manually extracted from public field recordings gathered by the Freesound.org project. 
    The dataset has been prearranged into 5 folds for comparable cross-validation, making sure that fragments from the same original source file are contained in a single fold.

    A more thorough description of the dataset is available in the original paper with some supplementary materials on GitHub: 
    
    .. code-block:: latex

        K. J. Piczak. ESC: Dataset for Environmental Sound Classification. Proceedings of the 23rd Annual ACM Conference on Multimedia, Brisbane, Australia, 2015.

    https://github.com/karolpiczak/ESC-50

    Repository content
    audio/<audio_name>.wav

    2000 audio recordings in WAV format (5 seconds, 44.1 kHz, mono) with the following naming convention:

    {FOLD}-{CLIP_ID}-{TAKE}-{TARGET}.wav

    {FOLD} - index of the cross-validation fold,
    {CLIP_ID} - ID of the original Freesound clip,
    {TAKE} - letter disambiguating between different fragments from the same Freesound clip,
    {TARGET} - class in numeric format [0, 49].

    meta/esc50.csv

    CSV file with the following structure:

    filename	fold	target	category	esc10	src_file	take
    
    The esc10 column indicates if a given file belongs to the ESC-10 subset (10 selected classes, CC BY license).

    https://github.com/karolpiczak/ESC-50/blob/master/meta/esc50-human.xlsx

    Additional data pertaining to the crowdsourcing experiment (human classification accuracy).

"""

import os
from typing import BinaryIO, Optional, Tuple

import librosa
import numpy as np
import csv

from soundata import download_utils, jams_utils, core, annotations, io


# BibTeX entry for the paper that introduced the dataset (Piczak, 2015).
# Passed to the Dataset constructor as ``bibtex``.
BIBTEX = """
@inproceedings{piczak2015dataset,
  title = {{ESC}: {Dataset} for {Environmental Sound Classification}},
  author = {Piczak, Karol J.},
  booktitle = {Proceedings of the 23rd {Annual ACM Conference} on {Multimedia}},
  date = {2015-10-13},
  url = {http://dl.acm.org/citation.cfm?doid=2733373.2806390},
  doi = {10.1145/2733373.2806390},
  location = {{Brisbane, Australia}},
  isbn = {978-1-4503-3459-4},
  publisher = {{ACM Press}},
  pages = {1015--1018}
}
"""
# Remote resources for the dataset, keyed by name. The single "all" entry
# points at the zip archive of the GitHub repository's master branch.
# NOTE(review): the checksum is 32 hex characters, presumably an MD5 digest
# of the archive -- confirm against download_utils.RemoteFileMetadata.
REMOTES = {
    "all": download_utils.RemoteFileMetadata(
        filename="ESC-50-master.zip",
        url="https://github.com/karoldvl/ESC-50/archive/master.zip",
        checksum="7771e4b9d86d0945acce719c7a59305a",
        unpack_directories=["ESC-50-master"],
    )
}

# Human-readable license statement, passed to the Dataset constructor as
# ``license_info``.
LICENSE_INFO = "Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0)"


class Clip(core.Clip):
    """ESC-50 Clip class

    Args:
        clip_id (str): id of the clip

    Attributes:
        audio (np.ndarray, float): the clip's audio signal and its sample rate
        audio_path (str): path to the audio file
        category (str): clip class in string format, i.e., label
        clip_id (str): clip id
        esc10 (bool): True if the clip belongs to the ESC-10 subset (10 selected classes, CC BY license)
        filename (str): clip filename
        fold (int): index of the cross-validation fold the clip belongs to
        src_file (str): freesound ID of the original file from which the clip was taken
        tags (soundata.annotations.Tags): tag (label) of the clip + confidence. In ESC-50 every clip has one tag.
        take (str): letter disambiguating between different fragments from the same Freesound clip (e.g., "A", "B", etc.)
        target (int): clip class in numeric format
    """

    def __init__(self, clip_id, data_home, dataset_name, index, metadata):
        super().__init__(clip_id, data_home, dataset_name, index, metadata)

        # Path for the "audio" entry, as returned by the base class's get_path.
        self.audio_path = self.get_path("audio")

    @property
    def audio(self) -> Optional[Tuple[np.ndarray, float]]:
        """The clip's audio

        Returns:
            * np.ndarray - audio signal
            * float - sample rate

        """
        return load_audio(self.audio_path)

    @property
    def filename(self):
        """The clip's filename

        Returns:
            * str - clip filename
        """
        return self._clip_metadata.get("filename")

    @property
    def fold(self):
        """The clip's fold

        Returns:
            * int - index of the cross-validation fold the clip belongs to

        """
        return self._clip_metadata.get("fold")

    @property
    def target(self):
        """The clip's target.

        Returns:
            * int - clip class in numeric format

        """
        return self._clip_metadata.get("target")

    @property
    def category(self):
        """The clip's category.

        Returns:
            * str - clip class in string format, i.e., label

        """
        return self._clip_metadata.get("category")

    @property
    def esc10(self):
        """The clip's esc10.

        Returns:
            * bool - True if the clip belongs to the ESC-10 subset (10 selected classes, CC BY license)

        """
        return self._clip_metadata.get("esc10")

    @property
    def src_file(self):
        """The clip's source file.

        Returns:
            * str - freesound ID of the original file from which the clip was taken

        """
        return self._clip_metadata.get("src_file")

    @property
    def take(self):
        """The clip's take

        Returns:
            * str - letter disambiguating between different fragments from the same Freesound clip (e.g., "A", "B", etc.)

        """
        return self._clip_metadata.get("take")

    @property
    def tags(self):
        """The clip's tags

        Returns:
            * annotations.Tags - a single tag holding the clip's category,
              with a confidence of 1.0

        """
        # NOTE(review): "open" is forwarded as the second Tags argument;
        # presumably it names the label vocabulary -- confirm against
        # soundata.annotations.Tags.
        return annotations.Tags(
            [self._clip_metadata.get("category")], "open", np.array([1.0])
        )

    def to_jams(self):
        """Get the clip's data in jams format

        Returns:
            jams.JAMS: the clip's data in jams format

        """
        return jams_utils.jams_converter(
            audio_path=self.audio_path, tags=self.tags, metadata=self._clip_metadata
        )


@io.coerce_to_bytes_io
def load_audio(fhandle: BinaryIO, sr: Optional[int] = None) -> Tuple[np.ndarray, float]:
    """Load an ESC-50 audio file

    Args:
        fhandle (str or file-like): File-like object or path to audio file
        sr (int or None): sample rate for loaded audio, None by default,
            which loads the file using its original sample rate of 44100.

    Returns:
        * np.ndarray - the mono audio signal
        * float - The sample rate of the audio file

    """
    # mono=True asks librosa for a single-channel signal; the returned rate
    # is kept in its own local so the ``sr`` argument is not rebound.
    audio, sample_rate = librosa.load(fhandle, sr=sr, mono=True)
    return audio, sample_rate


@core.docstring_inherit(core.Dataset)
class Dataset(core.Dataset):
    """The ESC-50 dataset"""

    def __init__(self, data_home=None):
        super().__init__(
            data_home,
            name="esc50",
            clip_class=Clip,
            bibtex=BIBTEX,
            remotes=REMOTES,
            license_info=LICENSE_INFO,
        )

    # Thin wrapper exposing the module-level load_audio on the dataset
    # object; its documentation is supplied by core.copy_docs.
    @core.copy_docs(load_audio)
    def load_audio(self, *args, **kwargs):
        return load_audio(*args, **kwargs)

    @core.cached_property
    def _metadata(self):
        """Per-clip metadata parsed from ``meta/esc50.csv``.

        Returns:
            dict: maps each clip id (the filename with ".wav" removed) to a
            dict with the keys ``filename`` (str), ``fold`` (int),
            ``target`` (int), ``category`` (str), ``esc10`` (bool),
            ``src_file`` (str) and ``take`` (str).

        Raises:
            FileNotFoundError: if the metadata CSV is not present under
                ``data_home``.

        """
        metadata_path = os.path.join(self.data_home, "meta", "esc50.csv")

        if not os.path.exists(metadata_path):
            raise FileNotFoundError("Metadata not found. Did you run .download()?")

        metadata_index = {}
        # newline="" lets the csv module do its own newline handling, as the
        # csv documentation recommends for files passed to csv.reader.
        with open(metadata_path, "r", newline="") as fhandle:
            for row in csv.reader(fhandle, delimiter=","):
                # Skip blank rows (csv.reader yields [] for them) and the
                # header row, whose first cell is the literal "filename".
                if not row or row[0] == "filename":
                    continue

                clip_id = row[0].replace(".wav", "")
                metadata_index[clip_id] = {
                    "filename": row[0],
                    "fold": int(row[1]),
                    "target": int(row[2]),
                    "category": row[3],
                    # The CSV stores the flag as the text "True"/"False".
                    "esc10": row[4] == "True",
                    "src_file": row[5],
                    "take": row[6],
                }

        return metadata_index