Skip to content
Snippets Groups Projects
Commit 308e7019 authored by STEVAN Antoine's avatar STEVAN Antoine :crab:
Browse files

improve the class material (mae-ac/language-detection!2)

- add some better emojis to the README
- move the test imports inside the test functions: because the students will have to write functions that do not exist at the start of the class, the error messages are pretty obscure
```
pytest
============================= test session starts ==============================
platform linux -- Python 3.10.12, pytest-8.2.2, pluggy-1.5.0
rootdir: /home/disc/a.stevan/documents/repos/gitlab.isae-supaero.fr/mae/language-detection
collected 0 items / 5 errors

==================================== ERRORS ====================================
_________ ERROR collecting tests/test_compute_occurence_frequencies.py _________
ImportError while importing test module '/home/disc/a.stevan/documents/repos/gitlab.isae-supaero.fr/mae/language-detection/tests/test_compute_occurence_frequencies.py'.
Hint: make sure your test modules/packages have valid Python names.
Traceback:
/usr/lib/python3.10/importlib/__init__.py:126: in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
tests/test_compute_occurence_frequencies.py:1: in <module>
    from src.enhanced_occurrence_frequencies import compute_occurence_frequencies
E   ModuleNotFoundError: No module named 'src.enhanced_occurrence_frequencies'
________________ ERROR collecting tests/test_detect_language.py ________________
ImportError while importing test module '/home/disc/a.stevan/documents/repos/gitlab.isae-supaero.fr/mae/language-detection/tests/test_detect_language.py'.
Hint: make sure your test modules/packages have valid Python names.
Traceback:
/usr/lib/python3.10/importlib/__init__.py:126: in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
tests/test_detect_language.py:1: in <module>
    from src.enhanced_occurrence_frequencies import (
E   ModuleNotFoundError: No module named 'src.enhanced_occurrence_frequencies'
____________________ ERROR collecting tests/test_entropy.py ____________________
ImportError while importing test module '/home/disc/a.stevan/documents/repos/gitlab.isae-supaero.fr/mae/language-detection/tests/test_entropy.py'.
Hint: make sure your test modules/packages have valid Python names.
Traceback:
/usr/lib/python3.10/importlib/__init__.py:126: in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
tests/test_entropy.py:1: in <module>
    from src.data_compression import entropy
E   ModuleNotFoundError: No module named 'src.data_compression'
____________________ ERROR collecting tests/test_kl_div.py _____________________
ImportError while importing test module '/home/disc/a.stevan/documents/repos/gitlab.isae-supaero.fr/mae/language-detection/tests/test_kl_div.py'.
Hint: make sure your test modules/packages have valid Python names.
Traceback:
/usr/lib/python3.10/importlib/__init__.py:126: in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
tests/test_kl_div.py:1: in <module>
    from src.enhanced_occurrence_frequencies import (
E   ModuleNotFoundError: No module named 'src.enhanced_occurrence_frequencies'
___________________ ERROR collecting tests/test_read_text.py ___________________
ImportError while importing test module '/home/disc/a.stevan/documents/repos/gitlab.isae-supaero.fr/mae/language-detection/tests/test_read_text.py'.
Hint: make sure your test modules/packages have valid Python names.
Traceback:
/usr/lib/python3.10/importlib/__init__.py:126: in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
tests/test_read_text.py:1: in <module>
    from src.enhanced_occurrence_frequencies import read_text
E   ModuleNotFoundError: No module named 'src.enhanced_occurrence_frequencies'
=========================== short test summary info ============================
ERROR tests/test_compute_occurence_frequencies.py
ERROR tests/test_detect_language.py
ERROR tests/test_entropy.py
ERROR tests/test_kl_div.py
ERROR tests/test_read_text.py
!!!!!!!!!!!!!!!!!!! Interrupted: 5 errors during collection !!!!!!!!!!!!!!!!!!!!
============================== 5 errors in 0.04s ===============================
```
now the errors will show
```
pytest
============================= test session starts ==============================
platform linux -- Python 3.10.12, pytest-8.2.2, pluggy-1.5.0
rootdir: /home/disc/a.stevan/documents/repos/gitlab.isae-supaero.fr/mae/language-detection
collected 6 items

tests/test_compute_occurence_frequencies.py FF                           [ 33%]
tests/test_detect_language.py F                                          [ 50%]
tests/test_entropy.py F                                                  [ 66%]
tests/test_kl_div.py F                                                   [ 83%]
tests/test_read_text.py F                                                [100%]

=================================== FAILURES ===================================
__________________________________ test_type ___________________________________

    def test_type():
>       from src.enhanced_occurrence_frequencies import compute_occurence_frequencies
E       ModuleNotFoundError: No module named 'src.enhanced_occurrence_frequencies'

tests/test_compute_occurence_frequencies.py:5: ModuleNotFoundError
_________________________________ test_output __________________________________

    def test_output():
>       from src.enhanced_occurrence_frequencies import compute_occurence_frequencies
E       ModuleNotFoundError: No module named 'src.enhanced_occurrence_frequencies'

tests/test_compute_occurence_frequencies.py:16: ModuleNotFoundError
_____________________________ test_detect_language _____________________________

    def test_detect_language():
>       from src.enhanced_occurrence_frequencies import (
            compute_occurence_frequencies, read_text, detect_language
        )
E       ModuleNotFoundError: No module named 'src.enhanced_occurrence_frequencies'

tests/test_detect_language.py:5: ModuleNotFoundError
_____________________________________ test _____________________________________

    def test():
>       from src.data_compression import entropy
E       ModuleNotFoundError: No module named 'src.data_compression'

tests/test_entropy.py:12: ModuleNotFoundError
_________________________________ test_kl_div __________________________________

    def test_kl_div():
>       from src.enhanced_occurrence_frequencies import (
            read_text, compute_occurence_frequencies, kl_divergence
        )
E       ModuleNotFoundError: No module named 'src.enhanced_occurrence_frequencies'

tests/test_kl_div.py:5: ModuleNotFoundError
________________________________ test_read_text ________________________________

    def test_read_text():
>       from src.enhanced_occurrence_frequencies import read_text
E       ModuleNotFoundError: No module named 'src.enhanced_occurrence_frequencies'

tests/test_read_text.py:2: ModuleNotFoundError
=========================== short test summary info ============================
FAILED tests/test_compute_occurence_frequencies.py::test_type - ModuleNotFoun...
FAILED tests/test_compute_occurence_frequencies.py::test_output - ModuleNotFo...
FAILED tests/test_detect_language.py::test_detect_language - ModuleNotFoundEr...
FAILED tests/test_entropy.py::test - ModuleNotFoundError: No module named 'sr...
FAILED tests/test_kl_div.py::test_kl_div - ModuleNotFoundError: No module nam...
FAILED tests/test_read_text.py::test_read_text - ModuleNotFoundError: No modu...
============================== 6 failed in 0.02s ===============================
```
parent b9489f80
Branches
No related tags found
No related merge requests found
......@@ -44,19 +44,21 @@ make test
Your first objective is to find the occurrence frequencies of the letters of the alphabet in a text.
Open the [`occurrence_frequencies.py`](src/occurrence_frequencies.py) script in your editor of choice and fill it. This file already contains the steps to follow :
:pencil: Open the [`occurrence_frequencies.py`](src/occurrence_frequencies.py) script in your editor
of choice and fill it. This file already contains the steps to follow :
- Create a dictionary containing the letters with the occurrences equal to 0
- For each letter in the text, increment the corresponding entry of the dictionary
- Normalize the values of the dictionary in order to have frequencies (the sum is equal to 1)
## Q2. Enhanced occurrence frequency analysis [[toc](#table-of-content)]
> :exclamation: **Important**
>
> there are tests that should pass
:file_folder: Create a new file called `enhanced_occurrence_frequencies.py` in the `src/` directory.
:gear: you can run the `make test` command and see 6 failing tests. These tests should hopefully
pass at the end of this question!
Now we will use the [_Wuthering Heights_ by Emily Bronte](data/english_wuthering_heights.txt) text and analyze it.
You should now be able to answer the following questions. Your work must be written in a new Python program, called `enhanced_occurrence_frequencies.py` in the `src` directory:
:pencil: You should now be able to answer the following questions:
1. Write a function `read_text` to read a text file and return a string containing only lower case characters.
> - arguments:
> 1. filename: `str`
......@@ -75,11 +77,13 @@ You should now be able to answer the following questions. Your work must be writ
$$D_{KL}(P,Q)=\sum_{i=1}^{n} p_i \log_2 \frac{p_i}{q_i}$$
> :bulb: **Note**
>
> All the $q_i$ must be strictly positive. If some of them are equal to zero, do not consider the corresponding symbol.
Since the KL divergence is not symmetric, you can symmetrize it by computing the absolute value of the average of $D_{KL}(P,Q)$ and $D_{KL}(Q,P)$.
> :bulb: **Note**
>
> You need the `log` and the `fabs` functions. Since they are not loaded by default, you must add the line `from math import log,fabs` at the beginning of your file.
Write a function `kl_divergence` that takes two probability distributions in the form of previously computed dictionaries and returns the symmetric _Kullback-Leibler_ divergence.
......@@ -99,9 +103,7 @@ Write a function `kl_divergence` that takes two probability distributions in the
---
## Q3. Influence of the text length on the quality of detection [[toc](#table-of-content)]
> :exclamation: **Important**
>
> your code should be written in `src/text_length.py`
:file_folder: Create a new file `src/text_length.py`.
The accuracy of the language detector depends on the length of the input text. The objective of this exercise is to evaluate this accuracy according to the length of the text.
......@@ -109,6 +111,7 @@ The accuracy of the language detector depends on the length of the input text. T
For each text length, evaluate the probability of good detection and plot the result with `matplotlib`.
> :bulb: **Note**
>
> the details of this question have been intentionally left as an exercise to the students.
> it's time for you to shine and experiment with what you think would be a good approach.
>
......@@ -118,9 +121,7 @@ For each text length, evaluate the probability of good detection and plot the re
To try to improve this result, you can try to capture the redundancy of a language in the sequences of 2 (or more) letters. For that, consider the pairs of consecutive letters as a symbol (of course, the size of your alphabet of symbols will increase) and do the same analysis to check if you can improve the detection results.
## Q4. Compression with Huffman [[toc](#table-of-content)]
> :exclamation: **Important**
>
> your code should be written in `src/data_compression.py`
:file_folder: Create a new file `src/data_compression.py`.
The analysis of the occurrence frequencies of the characters in the languages shows that there is a strong variability between the characters. This shows that there is some "redundancy" in the languages. In other words, this means that it is possible to compress the languages by using these occurrence frequencies.
......@@ -131,15 +132,14 @@ Basically speaking, the maximum level of compression that can be obtained with t
If you want to learn more on the entropy concept, which has strong applications in compression, error correcting codes and cryptography, have a look on [this video](https://www.khanacademy.org/computing/computer-science/informationtheory/moderninfotheory/v/information-entropy) :)
By assuming that the occurrence frequencies obtained in `englishOF` are representative of the occurrence probabilities of the characters in the English language, compute the entropy of the English language. For that :
1. create a new file `data_compression.py` in `src/`
2. add the line `from enhanced_occurrence_frequencies import read_text, compute_occurence_frequencies, detect_language` that allows you to use the functions defined in the previous file.
3. In order to be fair in the evaluation of the compression, you need to integrate all the possible characters, instead of just the letters. So, to generate the string characters which contains the considered characters, copy-paste the code :
1. add the line `from enhanced_occurrence_frequencies import read_text, compute_occurence_frequencies, detect_language` that allows you to use the functions defined in the previous file.
2. In order to be fair in the evaluation of the compression, you need to integrate all the possible characters, instead of just the letters. So, to generate the string characters which contains the considered characters, copy-paste the code :
```python
characters = [chr(i) for i in range(128)]
```
and re-compute the associated dictionaries
4. write a generic function that computes the entropy from any distribution probability represented by a dictionary.
3. write a generic function that computes the entropy from any distribution probability represented by a dictionary.
2. Compute the entropies of french, spanish and german languages.
3. The Huffman algorithm is a famous compression algorithm which is one of the components of current audio and video compression standards. A short presentation of this algorithm is given in [this video](https://www.khanacademy.org/computing/computer-science/informationtheory/moderninfotheory/v/compressioncodes). The objective here is not to program it (lack of time...) but rather to use an implementation. This implementation is provided in the file [`huffman.py`](src/huffman.py). You can find in this file the main steps of the algorithm :
1. the construction of the tree according to the probabilities with `build_huffman_tree`
......
from src.enhanced_occurrence_frequencies import compute_occurence_frequencies
from tests.constants import ALPHABET
def test_type():
from src.enhanced_occurrence_frequencies import compute_occurence_frequencies
actual = compute_occurence_frequencies("a", ALPHABET)
assert isinstance(actual, dict), (
"result of `compute_occurence_frequencies` should be a dictionary, "
......@@ -12,6 +13,8 @@ def test_type():
def test_output():
from src.enhanced_occurrence_frequencies import compute_occurence_frequencies
actual = compute_occurence_frequencies(
"this is some random text", ALPHABET
)
......
from src.enhanced_occurrence_frequencies import (
compute_occurence_frequencies, read_text, detect_language
)
from tests.constants import ALPHABET, LANGUAGE_FILES
def test_detect_language():
from src.enhanced_occurrence_frequencies import (
compute_occurence_frequencies, read_text, detect_language
)
ofs = {
lang: compute_occurence_frequencies(read_text(file), ALPHABET)
for lang, file in LANGUAGE_FILES.items()
......
from src.data_compression import entropy
from src.enhanced_occurrence_frequencies import (
read_text, compute_occurence_frequencies
)
from tests.constants import LANGUAGE_FILES, PRECISION, ASCII_ALPHABET
ENTROPIES = {
......@@ -14,6 +9,11 @@ ENTROPIES = {
def test():
from src.data_compression import entropy
from src.enhanced_occurrence_frequencies import (
read_text, compute_occurence_frequencies
)
for lang, file in LANGUAGE_FILES.items():
e = entropy(
compute_occurence_frequencies(read_text(file), ASCII_ALPHABET)
......
from src.enhanced_occurrence_frequencies import (
read_text, compute_occurence_frequencies, kl_divergence
)
from tests.constants import ALPHABET, LANGUAGE_FILES, PRECISION
def test_kl_div():
from src.enhanced_occurrence_frequencies import (
read_text, compute_occurence_frequencies, kl_divergence
)
try:
kl_divergence({}, {'a': 1.0})
raise Exception("should raise an error")
......
def test_read_text():
    """Check that `read_text` returns the exact lowercase contents of a file.

    Covers three fixtures: a one-line sample, an empty file, and a
    multiline file (newlines must be preserved).
    """
    # Import inside the test so that, before the student has written
    # `read_text`, pytest reports a clear per-test failure instead of
    # aborting the whole collection with an obscure ImportError.
    from src.enhanced_occurrence_frequencies import read_text

    assert read_text("tests/sample.txt") == "this is a sample text file\n"
    assert read_text("tests/empty.txt") == ""
    assert read_text("tests/multiline.txt") == "i\nam\na\nmultiline\ntext\n"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment