# File size: 2,613 Bytes
# coding=utf-8
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Loading script for a text classification dataset with 11 labels (train/test JSON files)."""

import json

import datasets
from datasets.tasks import TextClassification

# Remote JSON files backing the train and test splits; both live under the
# same data directory, so the shared prefix is spelled out once.
_DATA_ROOT_URL = "https://raw.kgithub.com/freeziyou/test_data/main/data"
_TRAIN_DOWNLOAD_URL = _DATA_ROOT_URL + "/train/train.json"
_TEST_DOWNLOAD_URL = _DATA_ROOT_URL + "/test/test.json"


class Data(datasets.GeneratorBasedBuilder):
    """Dataset builder for a single-label text classification corpus.

    Every example consists of a ``text`` string and a ``label`` taken from the
    fixed list of class names declared in ``_info``. The train and test splits
    are read from the JSON files at ``_TRAIN_DOWNLOAD_URL`` and
    ``_TEST_DOWNLOAD_URL``.
    """

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        """Return the dataset metadata (feature schema only; no description,
        homepage or citation is provided)."""
        return datasets.DatasetInfo(
            description=None,
            features=datasets.Features(
                {
                    "text": datasets.Value("string"),
                    "label": datasets.features.ClassLabel(names=[
                        "none",
                        "like",
                        "unlike",
                        "hope",
                        "questioning",
                        "express_surprise",
                        "normal_interaction",
                        "express_sad",
                        "tease",
                        "meme",
                        "express_abashed",
                    ]),
                }
            ),
            homepage=None,
            citation=None,
        )

    def _split_generators(self, dl_manager):
        """Download both JSON files and return the train and test SplitGenerators."""
        train_path = dl_manager.download_and_extract(_TRAIN_DOWNLOAD_URL)
        test_path = dl_manager.download_and_extract(_TEST_DOWNLOAD_URL)
        return [
            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": train_path}),
            datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepath": test_path}),
        ]

    def _generate_examples(self, filepath):
        """Yield examples as ``(key, example)`` tuples.

        Args:
            filepath: Path to a local JSON file whose top-level value is a
                sequence of records. Each record is either an object with
                ``"text"`` and ``"label"`` entries, or an ``[id, row]`` pair
                whose ``row`` is such an object.

        Yields:
            ``(key, {"text": ..., "label": ...})``. The key is the record's
            position in the file for plain row objects, or the id stored in
            the pair for ``[id, row]`` records.
        """
        # json.load reads the whole document, so the file can be closed
        # before any example is handed to the caller.
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)

        for index, entry in enumerate(data):
            if isinstance(entry, dict):
                # Plain row object: unpacking it directly (as the previous
                # loop did) would iterate over its keys, so the positional
                # index is used as the example key instead.
                key, row = index, entry
            else:
                # [id, row] pair layout, the only one the previous
                # un-enumerated loop could process; kept for compatibility.
                key, row = entry
            yield key, {"text": row["text"], "label": row["label"]}