from itertools import count, islice
from typing import Any, Iterable, Iterator, TypeVar

import gradio as gr
import requests
import pandas as pd
from datasets import Features
from gradio_huggingfacehub_search import HuggingfaceHubSearch

from analyze import analyzer, get_column_description, get_columns_with_strings, presidio_scan_entities

MAX_ROWS = 100
T = TypeVar("T")
DEFAULT_PRESIDIO_ENTITIES = sorted([
    'PERSON',
    'CREDIT_CARD',
    'US_SSN',
    'US_DRIVER_LICENSE',
    'PHONE_NUMBER',
    'US_PASSPORT',
    'EMAIL_ADDRESS',
    'IP_ADDRESS',
    'US_BANK_NUMBER',
    'IBAN_CODE',
    'EMAIL',
])

def stream_rows(dataset: str, config: str, split: str) -> Iterable[dict[str, Any]]:
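    """Stream the rows of a dataset split from the datasets-server /rows API, one batch at a time."""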
    batch_size = 100
    for i in count():
        rows_resp = requests.get(
            "https://datasets-server.huggingface.co/rows",
            params={"dataset": dataset, "config": config, "split": split, "offset": i * batch_size, "length": batch_size},
            timeout=20,
        ).json()
        if "error" in rows_resp:
            raise RuntimeError(rows_resp["error"])
        if not rows_resp["rows"]:
            break
        for row_item in rows_resp["rows"]:
            yield row_item["row"]
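
# For reference, a successful response from the datasets-server /rows endpoint
# looks roughly like this (abbreviated):
#
#   {
#       "features": [{"feature_idx": 0, "name": "text", "type": {...}}],
#       "rows": [{"row_idx": 0, "row": {"text": "..."}, "truncated_cells": []}],
#       "num_rows_total": 1000
#   }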

class track_iter:
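    """Wrap an iterable and expose next_idx, the count of items consumed so far."""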

    def __init__(self, it: Iterable[T]):
        self.it = it
        self.next_idx = 0

    def __iter__(self) -> Iterator[T]:
        for item in self.it:
            self.next_idx += 1
            yield item
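
# Minimal usage sketch for track_iter (illustrative values only):
#
#   tracked = track_iter(iter([{"text": "a"}, {"text": "b"}]))
#   for _ in tracked:
#       pass
#   assert tracked.next_idx == 2  # both items were consumed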

def analyze_dataset(dataset: str, enabled_presidio_entities: list[str]) -> Iterable[tuple[str, pd.DataFrame]]:
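    """Scan the first MAX_ROWS rows of a Hub dataset with Presidio, yielding (status, findings) updates."""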
    info_resp = requests.get(f"https://datasets-server.huggingface.co/info?dataset={dataset}", timeout=3).json()
    if "error" in info_resp:
        yield "❌ " + info_resp["error"], pd.DataFrame()
        return
    config = "default" if "default" in info_resp["dataset_info"] else next(iter(info_resp["dataset_info"]))
    features = Features.from_dict(info_resp["dataset_info"][config]["features"])
    split = "train" if "train" in info_resp["dataset_info"][config]["splits"] else next(iter(info_resp["dataset_info"][config]["splits"]))
    num_rows = min(info_resp["dataset_info"][config]["splits"][split]["num_examples"], MAX_ROWS)
    scanned_columns = get_columns_with_strings(features)
    columns_descriptions = [
        get_column_description(column_name, features[column_name]) for column_name in scanned_columns
    ]
    rows = track_iter(islice(stream_rows(dataset, config, split), MAX_ROWS))
    presidio_entities = []
    for presidio_entity in presidio_scan_entities(
        rows, scanned_columns=scanned_columns, columns_descriptions=columns_descriptions
    ):
        if presidio_entity["type"] in enabled_presidio_entities:
            presidio_entities.append(presidio_entity)
            yield f"βš™οΈ Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)
    yield f"βœ… Scanning {dataset} [{rows.next_idx}/{num_rows} rows]:", pd.DataFrame(presidio_entities)

with gr.Blocks() as demo:
    gr.Markdown("# Scan datasets using Presidio")
    gr.Markdown("The space takes an HF dataset name as an input, and returns the list of entities detected by Presidio in the first samples.")
    inputs = [
        HuggingfaceHubSearch(
            label="Hub Dataset ID",
            placeholder="Search for a dataset ID on Hugging Face",
            search_type="dataset",
        ),
        gr.CheckboxGroup(
            label="Presidio entities",
            choices=sorted(analyzer.get_supported_entities()),
            value=DEFAULT_PRESIDIO_ENTITIES,
            interactive=True,
        ),
    ]
    button = gr.Button("Run Presidio Scan")
    outputs = [
        gr.Markdown(),
        gr.DataFrame(),
    ]
    button.click(analyze_dataset, inputs, outputs)
    gr.Examples(
        [
            ["microsoft/orca-math-word-problems-200k", DEFAULT_PRESIDIO_ENTITIES],
            ["tatsu-lab/alpaca", DEFAULT_PRESIDIO_ENTITIES],
            ["Anthropic/hh-rlhf", DEFAULT_PRESIDIO_ENTITIES],
            ["OpenAssistant/oasst1", DEFAULT_PRESIDIO_ENTITIES],
            ["sidhq/email-thread-summary", DEFAULT_PRESIDIO_ENTITIES],
            ["lhoestq/fake_name_and_ssn", DEFAULT_PRESIDIO_ENTITIES]
        ],
        inputs,
        outputs,
        fn=analyze_dataset,
        run_on_click=True
    )

demo.launch()
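
# To try this Space locally (assuming the companion `analyze` module from the same
# repo sits next to this file), the dependencies are roughly:
#
#   pip install gradio requests pandas datasets presidio-analyzer gradio_huggingfacehub_search
#
# then run the script directly, e.g. `python app.py` if this file is named app.py
# (the conventional entry point name for a Gradio Space).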