
Commit 79c87e8

[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
1 parent df6e0e9 commit 79c87e8


43 files changed (+114, -121 lines)

ac_dc/anonymization.py (+1, -1)

@@ -30,7 +30,7 @@ def apply_regex_anonymization(
         tag_type=tag_type,
     )
     if anonymize_condition:
-        for (ent, start, end, tag) in ner:
+        for ent, start, end, tag in ner:
             # we need to actually walk through and replace by start, end span.
             sentence = sentence.replace(ent, f" <{tag}> ")
     return sentence, ner
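
Note (illustration, not part of the commit): the only change in this hunk drops the redundant parentheses around the loop targets; both spellings unpack identically. A minimal sketch with made-up data:

pairs = [("ent", 0, 3, "PER"), ("loc", 4, 7, "LOC")]

# Before the fix: parentheses around the unpacking targets.
for (ent, start, end, tag) in pairs:
    print(ent, start, end, tag)

# After the fix: same semantics, just less visual noise.
for ent, start, end, tag in pairs:
    print(ent, start, end, tag)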

ac_dc/deduplicate/self_deduplicate.py (+1, -2)

@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# -*- coding: utf-8 -*-
 # @Date : 2022-01-08 22:39:29
 # @Author : Chenghao Mou ([email protected])
 # @Description: Self-deduplication with `datasets`
@@ -28,7 +27,7 @@
 
 def main(conf: str) -> None:
 
-    with open(conf, "r") as f:
+    with open(conf) as f:
         conf = yaml.safe_load(f.read())
 
     if conf["load_from_disk"]["path"]:

ac_dc/visualization/get_data_for_visualization.py (+3, -3)

@@ -90,9 +90,9 @@ def compute_stats(self):
            )
            for n in range(2, 16)
        }
-        stats_document[
-            "character_repetition_ratio"
-        ] = character_repetition_ratios
+        stats_document["character_repetition_ratio"] = (
+            character_repetition_ratios
+        )
 
        word_repetition_ratios = {
            n: round(
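
Note (illustration, not part of the commit): this hunk and several later ones appear to be the formatter's newer wrapping preference for long subscript assignments: keep the subscript on one line and parenthesize the right-hand side instead of splitting the key across lines. A before/after sketch with invented values:

stats_document = {}
character_repetition_ratios = {2: 0.10, 3: 0.05}

# Old wrapping: the subscript itself is split across lines.
stats_document[
    "character_repetition_ratio"
] = character_repetition_ratios

# New wrapping: subscript stays intact, value is parenthesized. Same statement.
stats_document["character_repetition_ratio"] = (
    character_repetition_ratios
)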

ac_dc/visualization/visualization.py (+20, -20)

@@ -290,16 +290,16 @@ def get_cond(key, cutoff, max_cutoff):
                    "stopwords_ratio"
                ]
                for i in range(len(self.docs["stopwords_ratio"])):
-                    self.docs["stopwords_ratio"].iloc[
-                        i
-                    ] = Filtering.compute_stopwords_ratio(
-                        self.docs["text"].iloc[i],
-                        self.sentencepiece_model_tok,
-                        self.param["strip_characters"],
-                        self.param["cond_words_augmentation"],
-                        self.param["words_augmentation_group_sizes"],
-                        self.param["words_augmentation_join_char"],
-                        new_stopwords,
+                    self.docs["stopwords_ratio"].iloc[i] = (
+                        Filtering.compute_stopwords_ratio(
+                            self.docs["text"].iloc[i],
+                            self.sentencepiece_model_tok,
+                            self.param["strip_characters"],
+                            self.param["cond_words_augmentation"],
+                            self.param["words_augmentation_group_sizes"],
+                            self.param["words_augmentation_join_char"],
+                            new_stopwords,
+                        )
                    )
                cutoff_def = "If the stop words ratio of a document is lower than this number, the document is removed."
                cutoff_stopwords_ratio = st.slider(
@@ -326,16 +326,16 @@ def get_cond(key, cutoff, max_cutoff):
                    "flagged_words_ratio"
                ]
                for i in range(len(self.docs["flagged_words_ratio"])):
-                    self.docs["flagged_words_ratio"].iloc[
-                        i
-                    ] = Filtering.compute_flagged_words_ratio(
-                        self.docs["text"].iloc[i],
-                        self.sentencepiece_model_tok,
-                        self.param["strip_characters"],
-                        self.param["cond_words_augmentation"],
-                        self.param["words_augmentation_group_sizes"],
-                        self.param["words_augmentation_join_char"],
-                        new_flagged_words,
+                    self.docs["flagged_words_ratio"].iloc[i] = (
+                        Filtering.compute_flagged_words_ratio(
+                            self.docs["text"].iloc[i],
+                            self.sentencepiece_model_tok,
+                            self.param["strip_characters"],
+                            self.param["cond_words_augmentation"],
+                            self.param["words_augmentation_group_sizes"],
+                            self.param["words_augmentation_join_char"],
+                            new_flagged_words,
+                        )
                    )
                cutoff_def = "If the flagged words ratio of a document is higher than this number, the document is removed."
                max_fwr = np.max(self.docs["flagged_words_ratio"])

bertin/evaluation/run_glue.py (+10, -7)

@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding=utf-8
 # Copyright 2020 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -384,19 +383,23 @@ def main():
     # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
     # download model & vocab.
     config = AutoConfig.from_pretrained(
-        model_args.config_name
-        if model_args.config_name
-        else model_args.model_name_or_path,
+        (
+            model_args.config_name
+            if model_args.config_name
+            else model_args.model_name_or_path
+        ),
         num_labels=num_labels,
         finetuning_task=data_args.task_name,
         cache_dir=model_args.cache_dir,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
     )
     tokenizer = AutoTokenizer.from_pretrained(
-        model_args.tokenizer_name
-        if model_args.tokenizer_name
-        else model_args.model_name_or_path,
+        (
+            model_args.tokenizer_name
+            if model_args.tokenizer_name
+            else model_args.model_name_or_path
+        ),
         cache_dir=model_args.cache_dir,
         use_fast=model_args.use_fast_tokenizer,
         revision=model_args.model_revision,
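
Note (illustration, not part of the commit): when a multi-line conditional expression is passed as a call argument, the formatter now wraps it in its own parentheses, which makes the argument boundary and the trailing comma unambiguous. A runnable sketch with placeholder names standing in for the from_pretrained call:

def load(name, cache_dir=None):
    # Stand-in for AutoConfig.from_pretrained(...) in the real script.
    return name, cache_dir

config_name = None
model_name_or_path = "some-model"  # placeholder identifier

config = load(
    (
        config_name
        if config_name
        else model_name_or_path
    ),
    cache_dir=None,
)
assert config == ("some-model", None)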

bertin/evaluation/run_ner.py (+8, -7)

@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding=utf-8
 # Copyright 2020 The HuggingFace Team All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -364,9 +363,11 @@ def get_label_list(labels):
     # The .from_pretrained methods guarantee that only one local process can concurrently
     # download model & vocab.
     config = AutoConfig.from_pretrained(
-        model_args.config_name
-        if model_args.config_name
-        else model_args.model_name_or_path,
+        (
+            model_args.config_name
+            if model_args.config_name
+            else model_args.model_name_or_path
+        ),
         num_labels=num_labels,
         label2id=label_to_id,
         id2label={i: l for l, i in label_to_id.items()},
@@ -636,9 +637,9 @@ def compute_metrics(p):
        kwargs["dataset_tags"] = data_args.dataset_name
        if data_args.dataset_config_name is not None:
            kwargs["dataset_args"] = data_args.dataset_config_name
-            kwargs[
-                "dataset"
-            ] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
+            kwargs["dataset"] = (
+                f"{data_args.dataset_name} {data_args.dataset_config_name}"
+            )
        else:
            kwargs["dataset"] = data_args.dataset_name
 

bertin/mc4/mc4.py (+1, -2)

@@ -1,6 +1,5 @@
 """Perplexity Sampled mC4 dataset based on Common Crawl."""
 
-
 import gzip
 import json
 
@@ -404,7 +403,7 @@ def _generate_examples(self, filepaths):
        for filepath in filepaths:
            logger.info("generating examples from = %s", filepath)
            if filepath.endswith("jsonl"):
-                with open(filepath, "r", encoding="utf-8") as f:
+                with open(filepath, encoding="utf-8") as f:
                    for line in f:
                        if line:
                            example = json.loads(line)

bertin/run_mlm_flax.py (-1)

@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding=utf-8
 # Copyright 2021 The HuggingFace Team All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

bertin/run_mlm_flax_stream.py (+1, -2)

@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-# coding=utf-8
 # Copyright 2021 The HuggingFace Team All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -446,7 +445,7 @@ def restore_checkpoint(save_dir, state):
     args = joblib.load(os.path.join(save_dir, "training_args.joblib"))
     data_collator = joblib.load(os.path.join(save_dir, "data_collator.joblib"))
 
-    with open(os.path.join(save_dir, "training_state.json"), "r") as f:
+    with open(os.path.join(save_dir, "training_state.json")) as f:
         training_state = json.load(f)
     step = training_state["step"]
 
bertin/utils/dataset_perplexity.py (+1, -1)

@@ -17,7 +17,7 @@ def get_perplexity(doc):
 
 
 with open("mc4-es-train-50M-stats.csv", "w") as csv:
-    with open("mc4-es-train-50M-steps.jsonl", "r") as data:
+    with open("mc4-es-train-50M-steps.jsonl") as data:
        for line in tqdm(data):
            text = json.loads(line)["text"]
            csv.write(f"{len(text.split())},{get_perplexity(text)}\n")

cc_pseudo_crawl/python_scripts/deeper.py (+1)

@@ -1,6 +1,7 @@
 """
 Generate list of urls to query for next depth. We then need to use Athena to make a fancy query.
 """
+
 import csv
 import re
 import subprocess

cc_pseudo_crawl/python_scripts/download_warc.py (+2, -2)

@@ -143,9 +143,9 @@ def get_warcs(batch):
        existing_compressed_warcs,
    )
 
-    batch["compressed_warc"], batch["download_exception"] = [
+    batch["compressed_warc"], batch["download_exception"] = (
        list(l) for l in zip(*warcs_or_exceptions)
-    ]
+    )
    return batch
 
 
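
Note (illustration, not part of the commit): switching the brackets to parentheses turns the two-element list comprehension into a generator expression, but the unpacking assignment still works because the left-hand side accepts any iterable that yields exactly two items. A toy equivalence check with made-up values:

warcs_or_exceptions = [(b"warc-1", None), (b"warc-2", "timeout")]

# Before: list comprehension.  After: generator expression.  Same result.
warcs_a, errors_a = [list(l) for l in zip(*warcs_or_exceptions)]
warcs_b, errors_b = (list(l) for l in zip(*warcs_or_exceptions))

assert warcs_a == warcs_b and errors_a == errors_b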

cc_pseudo_crawl/python_scripts/exact_deduplicates.py (+1)

@@ -1,4 +1,5 @@
 """Taken from Teven and Leandro"""
+
 import gzip
 import os
 import shutil

cc_pseudo_crawl/python_scripts/load_all_seed_ids.py (+1, -1)

@@ -21,7 +21,7 @@ def main():
 
    seed_ids = []
    for seed_path in args.seed_paths:
-        with open(seed_path, "r") as fi:
+        with open(seed_path) as fi:
            data = csv.reader(fi)
            # First line is all the headers that we remove.
            seed_ids += [row[0] for row_id, row in enumerate(data) if row_id > 0]

cc_pseudo_crawl/python_scripts/pseudo_crawl_seed_to_lm_dset_v2.py (+1, -1)

@@ -126,7 +126,7 @@ def process_batch(batch, skip_set):
 # looks at up to the first 10K pages for a seed and
 # records lines that appear in at least 1% of the unique pages
 def get_lines_to_skip(dset, n_records, pourcentage_threshold, min_repetition_threshold):
-    line_counts = defaultdict(lambda: 0)
+    line_counts = defaultdict(int)
     seen_pages = set()
 
     seed = SeedSequence(42)
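
Note (illustration, not part of the commit): defaultdict(int) is the idiomatic spelling of defaultdict(lambda: 0), since calling int() with no arguments returns 0; the default factory behaves the same without the extra lambda. A quick sketch:

from collections import defaultdict

line_counts = defaultdict(int)  # int() -> 0, same default as lambda: 0
line_counts["some line"] += 1
line_counts["some line"] += 1

assert line_counts["some line"] == 2
assert line_counts["never seen"] == 0  # missing keys start at zero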

cc_pseudo_crawl/python_scripts/shard_by_seed_id.py (+1)

@@ -1,6 +1,7 @@
 """
 Deduplicating using `datasets` is much harder, we but we forgot to generate an id when building an index, so we're screwed.
 """
+
 import logging
 import subprocess
 from argparse import ArgumentParser

kenlm_training/cc_net/execution.py (+1, -2)

@@ -19,8 +19,7 @@
 
 
 class Executor(Protocol):
-    def __call__(self, function: Callable[..., str], *args: Iterable) -> None:
-        ...
+    def __call__(self, function: Callable[..., str], *args: Iterable) -> None: ...
 
 
 class SubmititRetryOnTimeout(submitit.helpers.Checkpointable):
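
Note (illustration, not part of the commit): this hunk, like the flat_hash_set.py and jsonql.py hunks below, collapses ellipsis-only bodies onto the signature line, the style newer formatter releases use for Protocol stubs. Behaviour is unchanged; a minimal sketch with an invented protocol:

from typing import List, Protocol

class Sink(Protocol):
    # Stub bodies collapsed onto one line, as in the commit.
    def write(self, line: str) -> int: ...
    def close(self) -> None: ...

class ListSink:
    def __init__(self) -> None:
        self.lines: List[str] = []

    def write(self, line: str) -> int:
        self.lines.append(line)
        return len(line)

    def close(self) -> None:
        pass

def emit(sink: Sink) -> None:
    sink.write("hello")
    sink.close()

emit(ListSink())  # ListSink satisfies Sink structurally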

kenlm_training/cc_net/flat_hash_set.py (+6, -12)

@@ -29,23 +29,17 @@ def __repr__(self):
        implementation = type(self).__name__
        return f"[{implementation}, len: {len(self)}"
 
-    def __len__(self) -> int:
-        ...
+    def __len__(self) -> int: ...
 
-    def __contains__(self, values: Sequence[np.uint64]) -> np.ndarray:
-        ...
+    def __contains__(self, values: Sequence[np.uint64]) -> np.ndarray: ...
 
-    def __getitem__(self, values) -> np.ndarray:
-        ...
+    def __getitem__(self, values) -> np.ndarray: ...
 
-    def __setitem__(self, keys, values) -> None:
-        ...
+    def __setitem__(self, keys, values) -> None: ...
 
-    def items(self) -> Iterable[Tuple[np.uint64, np.uint8]]:
-        ...
+    def items(self) -> Iterable[Tuple[np.uint64, np.uint8]]: ...
 
-    def keys(self) -> Iterable[np.uint64]:
-        ...
+    def keys(self) -> Iterable[np.uint64]: ...
 
    def __iter__(self) -> Iterator[np.uint64]:
        return iter(self.keys())

kenlm_training/cc_net/jsonql.py (+7, -12)

@@ -880,8 +880,7 @@ def describe(source, columns=None, weights=None, **kwargs):
            continue
        if "." in k or k == ALL_DOCUMENTS:
            continue
-        for line in display_stats(stats, k, weights=weights, **kwargs):
-            yield line
+        yield from display_stats(stats, k, weights=weights, **kwargs)
 
 
 def shard(lines):
@@ -902,17 +901,13 @@ def get_or_set(dictionary, key, default):
 class SimpleIO(Protocol):
    """A subset of methods from TextIO."""
 
-    def close(self) -> None:
-        ...
+    def close(self) -> None: ...
 
-    def write(self, line: str) -> int:
-        ...
+    def write(self, line: str) -> int: ...
 
-    def __enter__(self) -> "SimpleIO":
-        ...
+    def __enter__(self) -> "SimpleIO": ...
 
-    def __exit__(self, exc_type, exc_value, traceback):
-        ...
+    def __exit__(self, exc_type, exc_value, traceback): ...
 
 
 def open_read(filename: ReadableFileLike) -> Iterable[str]:
@@ -961,7 +956,7 @@ def open_read(filename: ReadableFileLike) -> Iterable[str]:
    if filename.suffix == ".gz":
        file: TextIO = gzip.open(filename, "rt")  # type: ignore
    else:
-        file = open(filename, "rt")
+        file = open(filename)
 
    return _close_when_exhausted(file)
 
@@ -1015,7 +1010,7 @@ def open_write(
    if filename.suffix == ".gz":
        return BlockedGzipWriter(Path(filename), mode, block_size="64M")
 
-    return open(filename, "wt")
+    return open(filename, "w")
 
 
 def parse_size(size):
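
Note (illustration, not part of the commit): the describe() change replaces an explicit re-yield loop with yield from, which delegates to the sub-generator and is equivalent for plain iteration. A tiny sketch with invented generators:

def numbers():
    yield 1
    yield 2

def forward_with_loop():
    for value in numbers():  # before: re-yield each item
        yield value

def forward_with_delegation():
    yield from numbers()  # after: delegate to the sub-generator

assert list(forward_with_loop()) == list(forward_with_delegation()) == [1, 2]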

kenlm_training/tests/test_jsonql.py (+1, -1)

@@ -262,7 +262,7 @@ def do(self, x):
 def acc(values):
    print("acc: started")
    res = 0
-    for (x, _) in values:
+    for x, _ in values:
        res += int(x)
    print("acc: done")
    yield f"acc: result={res}"

perplexity_lenses/perplexity_lenses/data.py (+8, -6)

@@ -34,9 +34,11 @@ def hub_dataset_to_dataframe(
            {
                text_column: sentence,
                "perplexity": model.get_perplexity(sentence),
-                "label": x.get("labels", [])[0]
-                if len(x.get("labels", [])) > 0
-                else "NONE",  # Special case for registry dataset
+                "label": (
+                    x.get("labels", [])[0]
+                    if len(x.get("labels", [])) > 0
+                    else "NONE"
+                ),  # Special case for registry dataset
            }
            for sentence in x[text_column].split("\n")
        ]
@@ -46,9 +48,9 @@ def hub_dataset_to_dataframe(
        lambda x: {
            text_column: x[text_column],
            "perplexity": model.get_perplexity(x[text_column]),
-            "label": x.get("labels", [])[0]
-            if len(x.get("labels", [])) > 0
-            else "NONE",  # Special case for registry dataset
+            "label": (
+                x.get("labels", [])[0] if len(x.get("labels", [])) > 0 else "NONE"
+            ),  # Special case for registry dataset
        }
    )
    instances = []
