Skip to content
Snippets Groups Projects
Commit 308e7019 authored by STEVAN Antoine's avatar STEVAN Antoine :crab:
Browse files

improve the class material (mae-ac/language-detection!2)

- add some better emojis to the README
- move the test imports inside the test functions: because the students will have to write functions that do not exist at the start of the class, the error messages are pretty obscure
```
pytest
============================= test session starts ==============================
platform linux -- Python 3.10.12, pytest-8.2.2, pluggy-1.5.0
rootdir: /home/disc/a.stevan/documents/repos/gitlab.isae-supaero.fr/mae/language-detection
collected 0 items / 5 errors

==================================== ERRORS ====================================
_________ ERROR collecting tests/test_compute_occurence_frequencies.py _________
ImportError while importing test module '/home/disc/a.stevan/documents/repos/gitlab.isae-supaero.fr/mae/language-detection/tests/test_compute_occurence_frequencies.py'.
Hint: make sure your test modules/packages have valid Python names.
Traceback:
/usr/lib/python3.10/importlib/__init__.py:126: in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
tests/test_compute_occurence_frequencies.py:1: in <module>
    from src.enhanced_occurrence_frequencies import compute_occurence_frequencies
E   ModuleNotFoundError: No module named 'src.enhanced_occurrence_frequencies'
________________ ERROR collecting tests/test_detect_language.py ________________
ImportError while importing test module '/home/disc/a.stevan/documents/repos/gitlab.isae-supaero.fr/mae/language-detection/tests/test_detect_language.py'.
Hint: make sure your test modules/packages have valid Python names.
Traceback:
/usr/lib/python3.10/importlib/__init__.py:126: in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
tests/test_detect_language.py:1: in <module>
    from src.enhanced_occurrence_frequencies import (
E   ModuleNotFoundError: No module named 'src.enhanced_occurrence_frequencies'
____________________ ERROR collecting tests/test_entropy.py ____________________
ImportError while importing test module '/home/disc/a.stevan/documents/repos/gitlab.isae-supaero.fr/mae/language-detection/tests/test_entropy.py'.
Hint: make sure your test modules/packages have valid Python names.
Traceback:
/usr/lib/python3.10/importlib/__init__.py:126: in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
tests/test_entropy.py:1: in <module>
    from src.data_compression import entropy
E   ModuleNotFoundError: No module named 'src.data_compression'
____________________ ERROR collecting tests/test_kl_div.py _____________________
ImportError while importing test module '/home/disc/a.stevan/documents/repos/gitlab.isae-supaero.fr/mae/language-detection/tests/test_kl_div.py'.
Hint: make sure your test modules/packages have valid Python names.
Traceback:
/usr/lib/python3.10/importlib/__init__.py:126: in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
tests/test_kl_div.py:1: in <module>
    from src.enhanced_occurrence_frequencies import (
E   ModuleNotFoundError: No module named 'src.enhanced_occurrence_frequencies'
___________________ ERROR collecting tests/test_read_text.py ___________________
ImportError while importing test module '/home/disc/a.stevan/documents/repos/gitlab.isae-supaero.fr/mae/language-detection/tests/test_read_text.py'.
Hint: make sure your test modules/packages have valid Python names.
Traceback:
/usr/lib/python3.10/importlib/__init__.py:126: in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
tests/test_read_text.py:1: in <module>
    from src.enhanced_occurrence_frequencies import read_text
E   ModuleNotFoundError: No module named 'src.enhanced_occurrence_frequencies'
=========================== short test summary info ============================
ERROR tests/test_compute_occurence_frequencies.py
ERROR tests/test_detect_language.py
ERROR tests/test_entropy.py
ERROR tests/test_kl_div.py
ERROR tests/test_read_text.py
!!!!!!!!!!!!!!!!!!! Interrupted: 5 errors during collection !!!!!!!!!!!!!!!!!!!!
============================== 5 errors in 0.04s ===============================
```
now the errors will show
```
pytest
============================= test session starts ==============================
platform linux -- Python 3.10.12, pytest-8.2.2, pluggy-1.5.0
rootdir: /home/disc/a.stevan/documents/repos/gitlab.isae-supaero.fr/mae/language-detection
collected 6 items

tests/test_compute_occurence_frequencies.py FF                           [ 33%]
tests/test_detect_language.py F                                          [ 50%]
tests/test_entropy.py F                                                  [ 66%]
tests/test_kl_div.py F                                                   [ 83%]
tests/test_read_text.py F                                                [100%]

=================================== FAILURES ===================================
__________________________________ test_type ___________________________________

    def test_type():
>       from src.enhanced_occurrence_frequencies import compute_occurence_frequencies
E       ModuleNotFoundError: No module named 'src.enhanced_occurrence_frequencies'

tests/test_compute_occurence_frequencies.py:5: ModuleNotFoundError
_________________________________ test_output __________________________________

    def test_output():
>       from src.enhanced_occurrence_frequencies import compute_occurence_frequencies
E       ModuleNotFoundError: No module named 'src.enhanced_occurrence_frequencies'

tests/test_compute_occurence_frequencies.py:16: ModuleNotFoundError
_____________________________ test_detect_language _____________________________

    def test_detect_language():
>       from src.enhanced_occurrence_frequencies import (
            compute_occurence_frequencies, read_text, detect_language
        )
E       ModuleNotFoundError: No module named 'src.enhanced_occurrence_frequencies'

tests/test_detect_language.py:5: ModuleNotFoundError
_____________________________________ test _____________________________________

    def test():
>       from src.data_compression import entropy
E       ModuleNotFoundError: No module named 'src.data_compression'

tests/test_entropy.py:12: ModuleNotFoundError
_________________________________ test_kl_div __________________________________

    def test_kl_div():
>       from src.enhanced_occurrence_frequencies import (
            read_text, compute_occurence_frequencies, kl_divergence
        )
E       ModuleNotFoundError: No module named 'src.enhanced_occurrence_frequencies'

tests/test_kl_div.py:5: ModuleNotFoundError
________________________________ test_read_text ________________________________

    def test_read_text():
>       from src.enhanced_occurrence_frequencies import read_text
E       ModuleNotFoundError: No module named 'src.enhanced_occurrence_frequencies'

tests/test_read_text.py:2: ModuleNotFoundError
=========================== short test summary info ============================
FAILED tests/test_compute_occurence_frequencies.py::test_type - ModuleNotFoun...
FAILED tests/test_compute_occurence_frequencies.py::test_output - ModuleNotFo...
FAILED tests/test_detect_language.py::test_detect_language - ModuleNotFoundEr...
FAILED tests/test_entropy.py::test - ModuleNotFoundError: No module named 'sr...
FAILED tests/test_kl_div.py::test_kl_div - ModuleNotFoundError: No module nam...
FAILED tests/test_read_text.py::test_read_text - ModuleNotFoundError: No modu...
============================== 6 failed in 0.02s ===============================
```
parent b9489f80
Branches
No related tags found
No related merge requests found
......@@ -44,19 +44,21 @@ make test
Your first objective is to find the occurrence frequencies of the letters of the alphabet in a text.
Open the [`occurrence_frequencies.py`](src/occurrence_frequencies.py) script in your editor of choice and fill it. This file already contains the steps to follow :
:pencil: Open the [`occurrence_frequencies.py`](src/occurrence_frequencies.py) script in your editor
of choice and fill it. This file already contains the steps to follow :
- Create a dictionary containing the letters with the occurrences equal to 0
- For each letter in the text, increment the corresponding entry of the dictionary
- Normalize the values of the dictionary in order to have frequencies (the sum is equal to 1)
## Q2. Enhanced occurrence frequency analysis [[toc](#table-of-content)]
> :exclamation: **Important**
>
> there are tests that should pass
:file_folder: Create a new file called `enhanced_occurrence_frequencies.py` in the `src/` directory.
:gear: you can run the `make test` command and see 6 failing tests. These tests should hopefully
pass at the end of this question!
Now we will use the [_Wuthering Heights_ by Emily Bronte](data/english_wuthering_heights.txt) text and analyze it.
You should now be able to answer the following questions. Your work must be written in a new Python program, called `enhanced_occurrence_frequencies.py` in the `src` directory:
:pencil: You should now be able to answer the following questions:
1. Write a function `read_text` to read a text file and return a string containing only lower case characters.
> - arguments:
> 1. filename: `str`
......@@ -75,11 +77,13 @@ You should now be able to answer the following questions. Your work must be writ
$$D_{KL}(P,Q)=\sum_{i=1}^{n} p_i \log_2 \frac{p_i}{q_i}$$
> :bulb: **Note**
>
> All the $q_i$ must be strictly positive. If some of them are equal to zero, do not consider the corresponding symbol.
Since the KL divergence is not symmetric, you can symmetrize it by computing the absolute value of the average of $D_{KL}(P,Q)$ and $D_{KL}(Q,P)$.
> :bulb: **Note**
>
> You need the `log` and the `fabs` functions. Since they are not loaded by default, you must add the line `from math import log,fabs` at the beginning of your file.
Write a function `kl_divergence` that takes two probability distributions in the form of previously computed dictionaries and returns the symmetric _Kullback-Leibler_ divergence.
......@@ -99,9 +103,7 @@ Write a function `kl_divergence` that takes two probability distributions in the
---
## Q3. Influence of the text length on the quality of detection [[toc](#table-of-content)]
> :exclamation: **Important**
>
> your code should be written in `src/text_length.py`
:file_folder: Create a new file `src/text_length.py`.
The accuracy of the language detector depends on the length of the input text. The objective of this exercise is to evaluate this accuracy according to the length of the text.
......@@ -109,6 +111,7 @@ The accuracy of the language detector depends on the length of the input text. T
For each text length, evaluate the probability of good detection and plot the result with `matplotlib`.
> :bulb: **Note**
>
> the details of this question have been intentionally left as an exercise to the students.
> it's time for you to shine and experiment with what you think would be a good approach.
>
......@@ -118,9 +121,7 @@ For each text length, evaluate the probability of good detection and plot the re
To try to improve this result, you can try to capture the redundancy of a language in the sequences of 2 (or more) letters. For that, consider the pairs of consecutive letters as a symbol (of course, the size of your alphabet of symbols will increase) and do the same analysis to check if you can improve the detection results.
## Q4. Compression with Huffman [[toc](#table-of-content)]
> :exclamation: **Important**
>
> your code should be written in `src/data_compression.py`
:file_folder: Create a new file `src/data_compression.py`.
The analysis of the occurrence frequencies of the characters in the languages shows that there is a strong variability between the characters. This shows that there is some "redundancy" in the languages. In other words, this means that it is possible to compress the languages by using these occurrence frequencies.
......@@ -131,15 +132,14 @@ Basically speaking, the maximum level of compression that can be obtained with t
If you want to learn more on the entropy concept, which has strong applications in compression, error correcting codes and cryptography, have a look on [this video](https://www.khanacademy.org/computing/computer-science/informationtheory/moderninfotheory/v/information-entropy) :)
By assuming that the occurrence frequencies obtained in `englishOF` are representative of the occurrence probabilities of the characters in the English language, compute the entropy of the English language. For that :
1. create a new file `data_compression.py` in `src/`
2. add the line `from enhanced_occurrence_frequencies import read_text, compute_occurence_frequencies, detect_language` that allows you to use the functions defined in the previous file.
3. In order to be fair in the evaluation of the compression, you need to integrate all the possible characters, instead of just the letters. So, to generate the string characters which contains the considered characters, copy-paste the code :
1. add the line `from enhanced_occurrence_frequencies import read_text, compute_occurence_frequencies, detect_language` that allows you to use the functions defined in the previous file.
2. In order to be fair in the evaluation of the compression, you need to integrate all the possible characters, instead of just the letters. So, to generate the string characters which contains the considered characters, copy-paste the code :
```python
characters = [chr(i) for i in range(128)]
```
and re-compute the associated dictionaries
4. write a generic function that computes the entropy from any distribution probability represented by a dictionary.
3. write a generic function that computes the entropy from any distribution probability represented by a dictionary.
2. Compute the entropies of french, spanish and german languages.
3. The Huffman algorithm is a famous compression algorithm which is one of the components of current audio and video compression standards. A short presentation of this algorithm is given in [this video](https://www.khanacademy.org/computing/computer-science/informationtheory/moderninfotheory/v/compressioncodes). The objective here is not to program it (lack of time...) but rather to use an implementation. This implementation is provided in the file [`huffman.py`](src/huffman.py). You can find in this file the main steps of the algorithm :
1. the construction of the tree according to the probabilities with `build_huffman_tree`
......
from src.enhanced_occurrence_frequencies import compute_occurence_frequencies
from tests.constants import ALPHABET
def test_type():
from src.enhanced_occurrence_frequencies import compute_occurence_frequencies
actual = compute_occurence_frequencies("a", ALPHABET)
assert isinstance(actual, dict), (
"result of `compute_occurence_frequencies` should be a dictionary, "
......@@ -12,6 +13,8 @@ def test_type():
def test_output():
from src.enhanced_occurrence_frequencies import compute_occurence_frequencies
actual = compute_occurence_frequencies(
"this is some random text", ALPHABET
)
......
from src.enhanced_occurrence_frequencies import (
compute_occurence_frequencies, read_text, detect_language
)
from tests.constants import ALPHABET, LANGUAGE_FILES
def test_detect_language():
from src.enhanced_occurrence_frequencies import (
compute_occurence_frequencies, read_text, detect_language
)
ofs = {
lang: compute_occurence_frequencies(read_text(file), ALPHABET)
for lang, file in LANGUAGE_FILES.items()
......
from src.data_compression import entropy
from src.enhanced_occurrence_frequencies import (
read_text, compute_occurence_frequencies
)
from tests.constants import LANGUAGE_FILES, PRECISION, ASCII_ALPHABET
ENTROPIES = {
......@@ -14,6 +9,11 @@ ENTROPIES = {
def test():
from src.data_compression import entropy
from src.enhanced_occurrence_frequencies import (
read_text, compute_occurence_frequencies
)
for lang, file in LANGUAGE_FILES.items():
e = entropy(
compute_occurence_frequencies(read_text(file), ASCII_ALPHABET)
......
from src.enhanced_occurrence_frequencies import (
read_text, compute_occurence_frequencies, kl_divergence
)
from tests.constants import ALPHABET, LANGUAGE_FILES, PRECISION
def test_kl_div():
from src.enhanced_occurrence_frequencies import (
read_text, compute_occurence_frequencies, kl_divergence
)
try:
kl_divergence({}, {'a': 1.0})
raise Exception("should raise an error")
......
def test_read_text():
    """Check that `read_text` returns the exact lowercase contents of a file.

    Covers three fixtures: a one-line sample, an empty file, and a
    multiline file (newlines must be preserved).
    """
    # Import inside the test so that, before the student has written
    # `read_text`, pytest reports a clear per-test failure instead of
    # aborting the whole collection with an obscure ImportError.
    from src.enhanced_occurrence_frequencies import read_text

    assert read_text("tests/sample.txt") == "this is a sample text file\n"
    assert read_text("tests/empty.txt") == ""
    assert read_text("tests/multiline.txt") == "i\nam\na\nmultiline\ntext\n"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment