add source files

90419019 · STEVAN Antoine · edefeee9 · 90419019 · 90419019 · 90419019
Unverified Commit 90419019 authored Jun 11, 2024 by STEVAN Antoine
--- a/src/huffman.py
+++ b/src/huffman.py
+import random
+
+def build_huffman_tree(letter_count):
+    """Build the binary Huffman encoding tree for a table of symbol weights.
+
+    Args:
+        letter_count: dictionary with char:count entries. The counts only
+            need to be addable and mutually comparable numbers.
+
+    Returns:
+        The root of the tree as a nested LIST structure: a leaf is the
+        character itself, an internal node is a two-element list
+        [first_extracted, second_extracted]. With a single entry the
+        root is that character.
+
+    Raises:
+        ValueError: if letter_count is empty.
+    """
+    import heapq  # local import: only this function needs it
+
+    if not letter_count:
+        raise ValueError("cannot build a Huffman tree from an empty letter_count")
+    # Heap entries are (weight, insertion order, node). The insertion order
+    # breaks weight ties in favour of the oldest entry and guarantees that
+    # heapq never has to compare two nodes (a str and a list are not orderable).
+    heap = [(weight, order, char)
+            for order, (char, weight) in enumerate(letter_count.items())]
+    heapq.heapify(heap)
+    next_order = len(heap)
+    while len(heap) > 1:
+        # combine the two lightest nodes into a new internal node
+        weight_a, _, node_a = heapq.heappop(heap)   # smallest in queue
+        weight_b, _, node_b = heapq.heappop(heap)   # next smallest
+        heapq.heappush(heap, (weight_a + weight_b, next_order, [node_a, node_b]))
+        next_order += 1
+    return heap[0][2]  # only the root node is left
+
+def extract_min(queue):
+    """Remove and return the (item, weight) entry of queue with the lowest weight.
+
+    When several entries share the lowest weight, the one closest to the
+    front of the list is taken. The list is modified in place.
+    """
+    position, _ = min(enumerate(queue), key=lambda entry: entry[1][1])
+    return queue.pop(position)
+
+def generate_code(huff_tree, prefix=""):
+    """Derive the codebook of a Huffman tree.
+
+    Args:
+        huff_tree: a leaf (a str) or an internal node whose items 0 and 1
+            are the left and right subtrees.
+        prefix: the bits accumulated on the path from the root down to
+            huff_tree.
+
+    Returns:
+        A dictionary where characters are keys and the associated binary
+        strings ('0' for left, '1' for right) are values.
+    """
+    if isinstance(huff_tree, str):  # a leaf
+        # A leaf reached with an empty prefix means the whole tree is one
+        # symbol. Give it the code "0": an empty code would make every
+        # text compress to "" and be impossible to decompress.
+        return {huff_tree: prefix or "0"}
+    lchild, rchild = huff_tree[0], huff_tree[1]
+    codebook = generate_code(lchild, prefix + "0")
+    codebook.update(generate_code(rchild, prefix + "1"))
+    return codebook
+    
+def compress(text, encoding_dict):
+    """Encode text as a string of bits using encoding_dict.
+
+    Characters whose code point is 128 or above are skipped; every other
+    character must be a key of encoding_dict.
+    """
+    assert isinstance(text, str)
+    pieces = []
+    for symbol in text:
+        if ord(symbol) >= 128:
+            continue  # non-ASCII characters are left out of the output
+        pieces.append(encoding_dict[symbol])
+    return "".join(pieces)
+
+
+def build_decoding_dict(encoding_dict):
+    """Return the inverse of encoding_dict, mapping each code back to its character."""
+    decoding_dict = {}
+    for char, code in encoding_dict.items():
+        decoding_dict[code] = char
+    return decoding_dict
+
+
+def decompress(bits, decoding_dict):
+    """Decode a string of bits back into text using decoding_dict.
+
+    Bits are accumulated until they form a key of decoding_dict; the
+    matching character is emitted and accumulation starts again. The
+    input must end exactly on a codeword boundary.
+    """
+    decoded = []
+    pending = ""
+    for bit in bits:
+        pending = pending + bit
+        if pending not in decoding_dict:
+            continue  # codeword not complete yet
+        decoded.append(decoding_dict[pending])
+        pending = ""
+    assert pending == ""  # must finish last codeword
+    return "".join(decoded)
+
+
--- a/src/lempel_ziv.py
+++ b/src/lempel_ziv.py
+# -*- coding: utf-8 -*-
+
+##########
+# original version downloaded from
+# https://gist.github.com/BertrandBordage/  
+# in december 2019
+##########
+
+
+from math import floor, ceil
+from typing import AnyStr # use a particular typing 
+
+
+# Lookup tables between every single-byte value (all 256 of them, not only
+# 7-bit ASCII) and its integer code, in both directions.
+ASCII_TO_INT: dict = {bytes((code,)): code for code in range(256)}
+INT_TO_ASCII: dict = {code: bytes((code,)) for code in range(256)}
+
+
+def compressLZ(data: AnyStr) -> bytes:
+    """Compress data with an LZW-style scheme that emits fixed 9-bit codes.
+
+    The phrase dictionary starts with the 256 single-byte values and is
+    reset to that state whenever it reaches 512 entries, so every code
+    fits in 9 bits.
+
+    Args:
+        data: the payload; a str is encoded with str.encode() first.
+
+    Returns:
+        The 9-bit codes packed big-endian into bytes, zero-padded at the
+        front up to a whole number of bytes. Empty input gives b''.
+    """
+    if isinstance(data, str):
+        data = data.encode()
+    if not data:
+        # Nothing to encode. Without this guard the loop below would reach
+        # its else branch before w is ever assigned and crash.
+        return b''
+    keys: dict = ASCII_TO_INT.copy()
+    n_keys: int = 256
+    compressed: list = []
+    start: int = 0
+    n_data: int = len(data)+1
+    while True:
+        if n_keys >= 512:
+            # the 9-bit code space is used up: start over with single bytes
+            keys = ASCII_TO_INT.copy()
+            n_keys = 256
+        # grow the phrase one byte at a time until it is no longer known
+        for i in range(1, n_data-start):
+            w: bytes = data[start:start+i]
+            if w not in keys:
+                compressed.append(keys[w[:-1]])  # emit the longest known prefix
+                keys[w] = n_keys                 # learn the new phrase
+                start += i-1                     # its last byte opens the next phrase
+                n_keys += 1
+                break
+        else:
+            # reached the end of data with a phrase that is still known
+            compressed.append(keys[w])
+            break
+    bits: str = ''.join(format(code, '09b') for code in compressed)
+    # integer ceiling division: number of bytes needed to hold all the bits
+    return int(bits, 2).to_bytes((len(bits) + 7) // 8, 'big')
+
+
+def decompressLZ(data: AnyStr) -> bytes:
+    """Rebuild the original bytes from a stream of packed 9-bit codes.
+
+    Mirrors compressLZ: the phrase dictionary starts with the 256
+    single-byte values and is reset once it holds 512 entries.
+
+    Args:
+        data: the packed codes; a str is encoded with str.encode() first.
+
+    Returns:
+        The decoded bytes. Input too short to hold a single 9-bit code
+        (including empty input) gives b''.
+    """
+    if isinstance(data, str):
+        data = data.encode()
+    bits: str = bin(int.from_bytes(data, 'big'))[2:].zfill(len(data) * 8)
+    n_extended_bytes: int = len(bits) // 9
+    if n_extended_bytes == 0:
+        # no complete code to decode; indexing data_list[0] below would fail
+        return b''
+    # the codes are right-aligned: drop the padding bits at the front
+    bits = bits[-n_extended_bytes * 9:]
+    data_list: list = [int(bits[i*9:(i+1)*9], 2)
+                       for i in range(n_extended_bytes)]
+    keys: dict = INT_TO_ASCII.copy()
+    previous: bytes = keys[data_list[0]]
+    uncompressed: list = [previous]
+    n_keys: int = 256
+    for i in data_list[1:]:
+        if n_keys >= 512:
+            # the 9-bit code space is used up: start over with single bytes
+            keys = INT_TO_ASCII.copy()
+            n_keys = 256
+        try:
+            current: bytes = keys[i]
+        except KeyError:
+            # Code not defined yet: treated as the phrase about to be added,
+            # i.e. the previous phrase followed by its own first byte.
+            # NOTE(review): any unknown code takes this path, so corrupt
+            # input is not detected here.
+            current = previous + previous[:1]
+        uncompressed.append(current)
+        keys[n_keys] = previous + current[:1]
+        previous = current
+        n_keys += 1
+    return b''.join(uncompressed)
+
+
+
--- a/src/occurrenceFrequencies.py
+++ b/src/occurrenceFrequencies.py
+# -*- coding: utf-8 -*-
+"""
+Created on "write the date here, please"
+@author: "write your name here please"
+"""
+
+
+# Sample passage (upper-case English prose) whose letters are to be counted.
+text = "I WENT AND CALLED, BUT GOT NO ANSWER. ON RETURNING, I WHISPERED TO CATHERINE THAT HE HAD HEARD A GOOD PART OF WHAT SHE SAID, I WAS SURE; ANDTOLD HOW I SAW HIM QUIT THE KITCHEN JUST AS SHE COMPLAINED OF HERBROTHER'S CONDUCT REGARDING HIM.  SHE JUMPED UP IN A FINE FRIGHT, FLUNG HARETON ON TO THE SETTLE, AND RAN TO SEEK FOR HER FRIEND HERSELF; NOT TAKING LEISURE TO CONSIDER WHY SHE WAS SO FLURRIED, OR HOW HER TALK WOULD HAVE AFFECTED HIM.  SHE WAS ABSENT SUCH A WHILE THAT JOSEPH PROPOSED WE SHOULD WAIT NO LONGER.  HE CUNNINGLY CONJECTURED THEY WERE STAYING AWAY IN ORDER TO AVOID HEARING HIS PROTRACTED BLESSING."
+print(text)  # echo the passage when the script runs
+
+# letters: the 26 upper-case characters whose frequencies are wanted; the
+# passage also contains spaces and punctuation that are not in this string
+letters ="ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+
+# PROBLEM: find the occurrence frequencies of the letters of the alphabet in the text above
+
+
+# We can decompose the problem as follows :
+#   1- create a dictionary containing the letters with the occurrences equal to 0 
+#   2- for each letter in the text, increment the corresponding entry of the dictionary
+#   3- normalize the values of the dictionary in order to have frequencies (the sum is equal to 1)