Upload dataset_processing_script.py with huggingface_hub
dataset_processing_script.py
ADDED
@@ -0,0 +1,140 @@
import re

from datasets import load_dataset
from deepmultilingualpunctuation import PunctuationModel
from multiprocess import set_start_method

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag

import nltk
import spacy

# from rpunct import RestorePuncts

# rpunct = RestorePuncts()

# Punctuation-restoration model used by repunctuation_apply_simple below
model = PunctuationModel()

# NOTE: the NLTK helpers below also need nltk.download("punkt") and
# nltk.download("averaged_perceptron_tagger") to have been run once.

ds = load_dataset("ylacombe/mls-eng-tags", split="train", num_proc=16)
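# Illustrative sketch of what the model does (output assumed, not verified):
#
#   model.restore_punctuation("my name is clara and i live in berkeley california")
#   -> "my name is clara and i live in berkeley, california."
#
# The model restores punctuation but not casing, which is why a truecasing
# pass is applied afterwards.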

def truecasing_by_pos(input_text):

    # break input text to sentences
    sent_texts = sent_tokenize(input_text)

    full_text = ""

    for sent_text in sent_texts:
        # tokenize the text into words
        words = word_tokenize(sent_text)

        # apply POS-tagging on words
        tagged_words = pos_tag([word.lower() for word in words])

        # apply capitalization based on POS tags
        capitalized_words = [w.capitalize() if t in ["NNP", "NNPS"] else w for (w, t) in tagged_words]

        # capitalize first word in sentence
        capitalized_words[0] = capitalized_words[0].capitalize()

        # join capitalized words
        text_truecase = " ".join(capitalized_words)

        # add a trailing space so consecutive sentences do not run together
        full_text += text_truecase.strip() + " "

    return full_text.strip()
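# Illustrative call (recasing of proper nouns depends on the NLTK tagger,
# which only sees lowercased tokens here and may therefore miss some):
#
#   truecasing_by_pos("where is john. he went to paris")
#   -> e.g. "Where is john. He went to paris"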

def true_case(text):
    # Split the text into sentences
    sentences = nltk.sent_tokenize(text)

    # Process each sentence
    true_cased_sentences = []
    for sentence in sentences:
        # Tokenize the sentence
        tokens = nltk.word_tokenize(sentence)

        # Perform POS tagging
        tagged = nltk.pos_tag(tokens)

        # Capitalize the first word of the sentence and NNP and NNPS tags
        for i, (word, tag) in enumerate(tagged):
            if i == 0 or tag in ('NNP', 'NNPS'):
                tagged[i] = (word.capitalize(), tag)

        # Join tokens back into a string, preserving punctuation
        true_cased_sentence = ' '.join(word for word, tag in tagged)

        # Remove spaces between punctuations and the preceding word
        true_cased_sentence = re.sub(r'(\w) (\W)', r'\1\2', true_cased_sentence)

        true_cased_sentences.append(true_cased_sentence)

    # Join the processed sentences back into a single string
    true_cased_text = ' '.join(true_cased_sentences)

    return true_cased_text
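# true_case is an NLTK variant of the same idea that keeps the original
# casing as tagger input and re-attaches punctuation with the regex above.
# Neither NLTK helper is used by the pipeline below; processing relies on
# true_case_spacy instead.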

# Run spaCy on GPU 2 (assumes a machine with at least three GPUs)
spacy.require_gpu(gpu_id=2)

# Load the spaCy model
nlp = spacy.load('en_core_web_trf')

from spacy.util import compile_infix_regex

def custom_tokenizer(nlp):
    # treat whole hyphenated words as one infix match so they are not split
    # on the hyphen (raw string avoids an invalid-escape-sequence warning)
    infixes = nlp.Defaults.infixes + [r'\w+(?:-\w+)+']
    infix_regex = compile_infix_regex(infixes)
    return spacy.tokenizer.Tokenizer(nlp.vocab, infix_finditer=infix_regex.finditer)

# Use the custom tokenizer
nlp.tokenizer = custom_tokenizer(nlp)

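# Illustrative effect of the extra infix pattern (output assumed): hyphenated
# words stay whole instead of being split on the hyphen by the default rules.
#
#   [t.text for t in nlp("a twenty-five year old")]
#   -> ['a', 'twenty-five', 'year', 'old']
#
# Caveat: constructing Tokenizer with only infix_finditer drops the default
# prefix/suffix rules and exceptions, so e.g. trailing punctuation stays
# attached to the preceding token.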
def true_case_spacy(text):
    # Process the text with the spaCy model
    doc = nlp(text)

    # Initialize an empty list to hold the processed sentences
    true_cased_sentences = []

    # Iterate through the sentences in the Doc object
    for sent in doc.sents:
        # Initialize an empty list to hold the processed tokens of the current sentence
        processed_tokens = []

        # Iterate through the tokens in the current sentence
        for i, token in enumerate(sent):
            # Capitalize the first word of the sentence and proper nouns
            if i == 0 or token.pos_ == 'PROPN':
                processed_tokens.append(token.text.capitalize())
            else:
                processed_tokens.append(token.text)

        # Join the processed tokens back into a string
        processed_sentence = ' '.join(processed_tokens)

        # Remove spaces between punctuations and the preceding word
        processed_sentence = re.sub(r'(\w) (\W)', r'\1\2', processed_sentence)

        # Add the processed sentence to the list of processed sentences
        true_cased_sentences.append(processed_sentence)

    # Join the processed sentences back into a single string
    true_cased_text = ' '.join(true_cased_sentences)

    return true_cased_text
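# Illustrative call (output assumed; PROPN tags come from en_core_web_trf):
#
#   true_case_spacy("my name is sarah and i live in london")
#   -> e.g. "My name is Sarah and i live in London"
#
# Note that the pronoun "i" is not recased: it is tagged PRON, not PROPN.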


def repunctuation_apply_simple(batch):

    # restore punctuation on the raw transcript, then recase it with spaCy
    repunct_sample = model.restore_punctuation(batch["text"])
    batch["repunct_text"] = true_case_spacy(repunct_sample)

    return batch
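# Despite its name, "batch" is a single example dict here: map is called
# without batched=True, so batch_size has no effect and batch["text"] is one
# string.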

if __name__ == "__main__":
    # "spawn" avoids CUDA issues when forking worker processes
    set_start_method("spawn")
    repunct_ds = ds.map(repunctuation_apply_simple, batch_size=1, num_proc=14)
    # pushing requires an authenticated Hugging Face token (huggingface-cli login)
    repunct_ds.push_to_hub("reach-vb/mls-eng-tags-spacy-v2", split="train")