Skip to content
Snippets Groups Projects
Unverified Commit c3c9a564 authored by STEVAN Antoine's avatar STEVAN Antoine :crab:
Browse files

remove Lempel-Ziv and "arithmetic coding"

they do not work
parent d5a48dae
Branches
No related tags found
No related merge requests found
......@@ -156,7 +156,6 @@ Basically speaking, the maximum level of compression that can be obtained with t
1. Evaluate the compression ratio by considering that each character of the text files is encoded on 7 bits. Thus the compression ratio is given by:
$$\frac{7\times \textrm{number of characters in the initial text file}}{\textrm{number of characters in the compressed file}}$$
4. How can the level of compression be improved? The occurrence frequencies of individual characters only capture the redundancy of single characters, not the redundancy between consecutive characters. So, you can repeat the same exercise by analyzing the occurrence frequencies of all 2-letter (or longer) words. This should capture more of the redundancy in the text and thus improve the compression ratio.
5. If you have finished, compare the performance of Huffman coding with that of other compression algorithms: [_Arithmetic Coding_](src/arithmetic_coding.py) and [_Lempel-Ziv_](src/lempel_ziv.py).
## Compress the files and [upload them to the LMS](https://lms.isae.fr/mod/assign/view.php?id=107859) [[toc](#table-of-content)]
- [ ] `src/occurrence_frequencies.py`
......
##########
# original version downloaded from
# https://rosettacode.org/wiki/Arithmetic_coding/As_a_generalized_change_of_radix
# in december 2019
##########
from collections import Counter
def cumulative_freq(freq):
    """Build the cumulative frequency table of the byte values in *freq*.

    freq : mapping from a byte value (int in 0..255) to its occurrence count.

    Returns a dict mapping every byte value present in *freq* to the sum of
    the counts of all smaller byte values present, i.e. the first slot that
    symbol owns in the range [0, total count). Keys outside 0..255 are
    ignored.
    """
    cf = {}
    total = 0
    # Visit the byte values in increasing order so that the encoder and the
    # decoder always derive the same table from the same counts.
    for b in range(256):
        if b in freq:
            cf[b] = total
            total += freq[b]
    return cf


def arithmethic_coding(toBeEncoded, textRef, radix):
    """Encode a byte string as one integer (generalized change of radix).

    toBeEncoded : the bytes to encode
    textRef     : the bytes used to build the frequency distribution
    radix       : the radix for the encoding (integer >= 2)

    Returns the tuple (enc, pow, freq): enc * radix**pow falls inside the
    interval that identifies the message, and freq is the Counter built from
    textRef that the decoder needs.

    Raises ValueError if radix < 2, and KeyError if toBeEncoded contains a
    byte value that never occurs in textRef.
    """
    if radix < 2:
        # radix 1 would never shrink pf below (infinite loop), radix 0 would
        # divide by zero.
        raise ValueError("radix must be an integer greater than or equal to 2")

    # The frequency characters
    freq = Counter(textRef)
    print(freq)

    # The cumulative frequency table
    cf = cumulative_freq(freq)
    print('cf=',cf)

    # Base: the total of all frequencies. The decoder uses the same value,
    # and every cumulative slot cf[b] must be smaller than the base, so the
    # base has to come from the frequency table and not from the length of
    # the message being encoded.
    base = sum(freq.values())

    # Lower bound
    lower = 0

    # Product of all frequencies
    pf = 1

    # Each term is multiplied by the product of the
    # frequencies of all previously occurring symbols
    for b in toBeEncoded:
        lower = lower*base + cf[b]*pf
        pf *= freq[b]

    # Upper bound
    upper = lower + pf

    # Largest exponent such that radix**exponent <= pf: that many trailing
    # radix-digits can be dropped without leaving the interval.
    exponent = 0
    remaining = pf // radix
    while remaining:
        exponent += 1
        remaining //= radix

    enc = (upper-1) // radix**exponent
    return enc, exponent, freq


def arithmethic_decoding(enc, radix, pow, freq, length=None):
    """Decode an integer produced by arithmethic_coding back into bytes.

    enc    : the encoded integer
    radix  : the radix used for the encoding
    pow    : the exponent returned by the encoder
    freq   : the frequency table returned by the encoder
    length : number of symbols to decode; defaults to the total of all
             frequencies, which is the message length when the message was
             also used as the reference text

    Returns the decoded bytes.

    Raises ValueError if length is negative, and KeyError if the number does
    not describe a message of that length under this frequency table.
    """
    # Multiply enc by radix^pow
    value = enc * radix**pow

    # Base
    base = sum(freq.values())

    if length is None:
        length = base
    if length < 0:
        raise ValueError("length must not be negative")

    # Create the cumulative frequency table
    cf = cumulative_freq(freq)

    # Map every slot of [0, base) to the symbol that owns it
    symbol_at = {}
    for symbol, first_slot in cf.items():
        for slot in range(first_slot, first_slot + freq[symbol]):
            symbol_at[slot] = symbol

    # Decode the input number, most significant symbol first
    decoded = bytearray()
    for i in range(length-1, -1, -1):
        place = base**i
        c = symbol_at[value // place]
        # Remove the contribution of this symbol and undo the scaling by
        # its frequency to expose the next symbol.
        value = (value - place*cf[c]) // freq[c]
        decoded.append(c)

    # Return the decoded output
    return bytes(decoded)
radix = 2  # can be any integer greater than or equal to 2

# example: encode a message with its own statistics, decode it again and
# check that the round trip gives back the original bytes
message = 'DABDDB DABDDBBDDBA ABRACADABRA TOBEORNOTTOBEORTOBEORNOT'
message = message.encode()  # used to handle the string by bytes
# The names below deliberately avoid `str` and `pow`, which would hide the
# builtins for the rest of the module.
enc, exponent, freq = arithmethic_coding(message, message, radix)
dec = arithmethic_decoding(enc, radix, exponent, freq)
print("%-25s=> %19s * %d^%s" % (message, enc, radix, exponent))
if message != dec:
    raise Exception("\tHowever that is incorrect!")
else:
    print("yes, Bro.")
# -*- coding: utf-8 -*-
##########
# original version downloaded from
# https://gist.github.com/BertrandBordage/
# in december 2019
##########
from math import floor, ceil
from typing import AnyStr # use a particular typing
# Single-byte lookup tables used to seed every fresh dictionary:
# ASCII_TO_INT maps each one-byte string to its value, INT_TO_ASCII is the
# reverse mapping.
ASCII_TO_INT: dict = {i.to_bytes(1, 'big'): i for i in range(256)}
INT_TO_ASCII: dict = {i: b for b, i in ASCII_TO_INT.items()}


def compressLZ(data: AnyStr) -> bytes:
    """Compress *data* with LZW using fixed 9-bit codes.

    data : the text to compress; a str is encoded to bytes first.

    The dictionary starts with the 256 single bytes and is reset to that
    state as soon as it holds 512 entries, so every code fits in 9 bits.
    The codes are concatenated and left-padded with zero bits up to a whole
    number of bytes.

    Returns the compressed bytes (b'' for empty input).
    """
    if isinstance(data, str):
        data = data.encode()
    if not data:
        # Nothing to encode: there is no phrase to emit a code for.
        return b''

    keys: dict = ASCII_TO_INT.copy()
    n_keys: int = 256
    compressed: list = []
    phrase: bytes = b''
    for value in data:
        candidate = phrase + bytes((value,))
        if candidate in keys:
            # Keep growing the phrase while the dictionary knows it.
            phrase = candidate
            continue
        # Longest known phrase found: emit it and learn the extended one.
        compressed.append(keys[phrase])
        keys[candidate] = n_keys
        n_keys += 1
        if n_keys >= 512:
            # Dictionary full: start over so codes never exceed 9 bits.
            keys = ASCII_TO_INT.copy()
            n_keys = 256
        # The byte that broke the match starts the next phrase.
        phrase = candidate[-1:]
    compressed.append(keys[phrase])

    bits: str = ''.join(format(code, '09b') for code in compressed)
    return int(bits, 2).to_bytes((len(bits) + 7) // 8, 'big')


def decompressLZ(data: AnyStr) -> bytes:
    """Rebuild the bytes that compressLZ turned into *data*.

    data : the compressed stream; a str is encoded to bytes first.

    Returns the decompressed bytes (b'' for empty input).

    Raises ValueError if the stream is too short to hold one 9-bit code or
    contains a code that cannot occur at that position, and KeyError if the
    first code is not a single-byte code.
    """
    if isinstance(data, str):
        data = data.encode()
    if not data:
        return b''

    keys: dict = INT_TO_ASCII.copy()
    bits: str = bin(int.from_bytes(data, 'big'))[2:].zfill(len(data) * 8)
    n_codes: int = len(bits) // 9
    if n_codes == 0:
        raise ValueError("compressed data is shorter than one 9-bit code")
    # Drop the zero bits that were added in front to fill the first byte.
    bits = bits[len(bits) - n_codes * 9:]
    codes: list = [int(bits[i*9:(i+1)*9], 2) for i in range(n_codes)]

    previous: bytes = keys[codes[0]]
    uncompressed: list = [previous]
    n_keys: int = 256
    for code in codes[1:]:
        if n_keys >= 512:
            # Mirror the dictionary reset done by the compressor.
            keys = INT_TO_ASCII.copy()
            n_keys = 256
        if code in keys:
            current: bytes = keys[code]
        elif code == n_keys:
            # The code refers to the entry about to be created: it is the
            # previous phrase followed by its own first byte.
            current = previous + previous[:1]
        else:
            raise ValueError("invalid code %d in compressed data" % code)
        uncompressed.append(current)
        keys[n_keys] = previous + current[:1]
        previous = current
        n_keys += 1
    return b''.join(uncompressed)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment