add example code and readme

Files changed (5) hide show

README.md +4 -0
example/__pycache__/text_normalizer.cpython-39.pyc +0 -0
example/math_score.py +23 -0
example/perplexity.py +15 -0
example/text_normalizer.py +199 -0

README.md CHANGED Viewed

@@ -1,3 +1,7 @@
 ---
 license: apache-2.0
 ---

 ---
 license: apache-2.0
 ---
+This repository stores the MathScore and KenLM models used in the generation of OpenWebMath.
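+To test the models, `git clone` this repository and install the `fasttext` and `kenlm` Python packages. The example scripts live in the `example/` directory and load the model files from the repository root, one level up, so `cd example` first; then run `python perplexity.py` to test the KenLM model or `python math_score.py` to test the MathScore model.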

example/__pycache__/text_normalizer.cpython-39.pyc ADDED Viewed

Binary file (4.47 kB). View file

example/math_score.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import fasttext
+from text_normalizer import normalize
+def score_text(model, text):
+    normalized_text = normalize(text).replace('\n', ' ')
+    # Remove any [EQUATION] tokens
+    normalized_text = normalized_text.replace('[EQUATION]', '')
+    pred = model.predict(normalized_text, k=2)
+    if pred[0][0] == '__label__positive':
+        prob = pred[1][0]
+    else:
+        prob = pred[1][1]
+    return prob
+# Load the fasttext model
+model = fasttext.load_model('../math_score.bin')
+# Test the model
+TEXT = """I thought I’d add a little bit of background. The previous discussion started from the result $P(B|AC) = K^{-1}P(B|C)P(A|BC) = K^{-1} P(AB|C)$ where $K=P(A|C).$ Although this is called Bayes’ theorem, the general form of it as stated here was actually first written down, not by Bayes but by Laplace."""
+print(score_text(model, TEXT))  # Should print out 0.912

example/perplexity.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import kenlm
+from text_normalizer import normalize
+def document_perplexity(model, text):
+    text = normalize(text)
+    score = model.score(text)
+    return 10 ** (-score / len(text.split()))
+# Load the language model
+model = kenlm.Model('../lm-v2.binary')
+# Test the model
+TEXT = """I thought I’d add a little bit of background. The previous discussion started from the result $P(B|AC) = K^{-1}P(B|C)P(A|BC) = K^{-1} P(AB|C)$ where $K=P(A|C).$ Although this is called Bayes’ theorem, the general form of it as stated here was actually first written down, not by Bayes but by Laplace."""
+print(document_perplexity(model, TEXT))  # Should print out ~239

example/text_normalizer.py ADDED Viewed

	@@ -0,0 +1,199 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# From https://github.com/facebookresearch/cc_net/blob/main/cc_net/text_normalizer.py
+import re
+import unicodedata
+# Maps single non-ASCII punctuation characters to their ASCII replacement
+# strings. Used character-by-character by replace_unicode_punct().
+UNICODE_PUNCT = {
+    "，": ",",
+    "。": ".",
+    "、": ",",
+    "„": '"',
+    "”": '"',
+    "“": '"',
+    "«": '"',
+    "»": '"',
+    "１": '"',  # NOTE(review): fullwidth digit one mapped to a quote - looks unintended; confirm
+    "」": '"',
+    "「": '"',
+    "《": '"',
+    "》": '"',
+    "´": "'",
+    "∶": ":",
+    "：": ":",
+    "？": "?",
+    "！": "!",
+    "（": "(",
+    "）": ")",
+    "；": ";",
+    "–": "-",
+    "—": " - ",
+    "．": ". ",
+    "～": "~",
+    "’": "'",
+    "…": "...",
+    "━": "-",
+    "〈": "<",
+    "〉": ">",
+    "【": "[",
+    "】": "]",
+    "％": "%",
+    "►": "-",
+}
+# Character class matching any single key of UNICODE_PUNCT.
+UNICODE_PUNCT_RE = re.compile(f"[{''.join(UNICODE_PUNCT.keys())}]")
+# Shortest span delimited by "$" or "$$" on each side whose opening dollar sign
+# is not preceded by a backslash. Kept as a plain string; callers pass flags.
+MATH_RE = r"(?<!\\)(\$\$?.+?\$\$?)"
+# Shortest span delimited by one to three backticks on each side.
+CODE_RE = r'\`{1,3}.*?\`{1,3}'
+def replace_unicode_punct(text: str) -> str:
+    return "".join((UNICODE_PUNCT.get(c, c) for c in text))
+def remove_unicode_punct(text: str) -> str:
+    """More aggressive version of replace_unicode_punct but also faster."""
+    return UNICODE_PUNCT_RE.sub("", text)
+def strip_accents(line: str) -> str:
+    """Strips accents from a piece of text."""
+    nfd = unicodedata.normalize("NFD", line)
+    output = [c for c in nfd if unicodedata.category(c) != "Mn"]
+    if len(output) == line:
+        return line
+    return "".join(output)
+# Build a regex matching all control characters.
+# The class covers code points 0-31 and 127-159, which includes tab, line feed
+# and carriage return.
+NON_PRINTING_CHARS_RE = re.compile(
+    f"[{''.join(map(chr, list(range(0,32)) + list(range(127,160))))}]"
+)
+# Matches one decimal digit.
+DIGIT_RE = re.compile(r"\d")
+# One character class covering both UNICODE_PUNCT_RE and NON_PRINTING_CHARS_RE:
+# concatenating the two patterns yields "[...][...]", and removing the inner
+# "][" merges them into a single class.
+PUNCT_OR_NON_PRINTING_CHARS_RE = re.compile(
+    (UNICODE_PUNCT_RE.pattern + NON_PRINTING_CHARS_RE.pattern).replace("][", "")
+)
+def remove_non_printing_char(text: str) -> str:
+    return NON_PRINTING_CHARS_RE.sub("", text)
+def normalize_spacing_for_tok(text: str, language: str = "en") -> str:
+    res = (
+        text.replace("\r", "")
+        # remove extra spaces
+        .replace("(", " (")
+        .replace(")", ") ")
+        .replace(" +", " ")
+    )
+    res = re.sub(r"\) ([\.\!\:\?\;\,])", r"\)\1", res)
+    res = res.replace("( ", "(").replace(" )", ")")
+    res = re.sub(r"(\d) \%", r"\1\%", res)
+    res = res.replace(" :", ":").replace(" ;", ";")
+    res = res.replace("`", "'").replace("''", ' " ')
+    res = (
+        res.replace("„", '"')
+        .replace("“", '"')
+        .replace("”", '"')
+        .replace("–", "-")
+        .replace("—", " - ")
+        .replace(" +", " ")
+        .replace("´", "'")
+        .replace("([a-z])‘([a-z])", r"\1'\2/")
+        .replace("([a-z])’([a-z])", r"\1'\2/")
+        .replace("‘", '"')
+        .replace("‚", '"')
+        .replace("’", '"')
+        .replace("''", '"')
+        .replace("´´", '"')
+        .replace("…", "...")
+        # French quotes
+        .replace(" « ", ' "')
+        .replace("« ", '"')
+        .replace("«", '"')
+        .replace(" » ", '" ')
+        .replace(" »", '"')
+        .replace("»", '"')
+        # handle pseudo-spaces
+        .replace(" %", "%")
+        .replace("nº ", "nº ")
+        .replace(" :", ":")
+        .replace(" ºC", " ºC")
+        .replace(" cm", " cm")
+        .replace(" ?", "?")
+        .replace(" !", "!")
+        .replace(" ;", ";")
+        .replace(", ", ", ")
+        .replace(" +", " ")
+        .replace("．", ". ")
+    )
+    # English "quotation," followed by comma, style
+    if language == "en":
+        res = re.sub(r"\"([,\.]+)", r"\1\"", res)
+    # Czech is confused
+    elif language == "cs" or language == "cz":
+        pass
+    # German/Spanish/French "quotation", followed by comma, style
+    else:
+        res = res.replace(',"', '",')
+        res = re.sub(
+            r"(\.+)\"(\s*[^<])", r"\"\1\2", res
+        )  # don't fix period at end of sentence
+    if (
+        language == "de"
+        or language == "es"
+        or language == "cz"
+        or language == "cs"
+        or language == "fr"
+    ):
+        res = re.sub(r"(\d) (\d)", r"\1,\2", res)
+    else:
+        res = re.sub(r"(\d) (\d)", r"\1.\2", res)
+    return res
+def normalize(line: str, accent=True, case=True, numbers=True, math=True, code=True, punct=1) -> str:
+    line = line.strip()
+    if not line:
+        return line
+    if case:
+        line = line.lower()
+    if accent:
+        line = strip_accents(line)
+    if numbers:
+        line = DIGIT_RE.sub("0", line)
+    if punct == 1:
+        line = replace_unicode_punct(line)
+    elif punct == 2:
+        line = remove_unicode_punct(line)
+    if math:
+        line = re.sub(MATH_RE, "[EQUATION]", line, flags=re.DOTALL)
+    if code:
+        line = re.sub(CODE_RE, "[CODE]", line, flags=re.DOTALL)
+    # Replace any <s> or </s> explicitly written in the text with nothing
+    line = line.replace("<s>", "").replace("</s>", "")
+    line = remove_non_printing_char(line)
+    return line
+def slow_normalize_for_dedup(line: str) -> str:
+    return normalize(line, accent=False, case=True, numbers=True, punct=2)
+def normalize_for_dedup(line: str) -> str:
+    line = line.strip()
+    if not line:
+        return line
+    # case
+    line = line.lower()
+    # numbers
+    line = DIGIT_RE.sub("0", line)
+    line = PUNCT_OR_NON_PRINTING_CHARS_RE.sub("", line)
+    return line