From 7cbaa3cfc3691147c28614783c3fc3ebfdc0b042 Mon Sep 17 00:00:00 2001 From: bd Date: Sun, 7 Jul 2024 01:54:05 -0600 Subject: Added huffman coding implementation and tests --- huffman-coding/huffman-test.scm | 179 ++++++++++++++++++++++++++++++++++++++++ huffman-coding/huffman.scm | 124 ++++++++++++++++++++++++++++ huffman-coding/pq.scm | 1 + 3 files changed, 304 insertions(+) create mode 100644 huffman-coding/huffman-test.scm create mode 100644 huffman-coding/huffman.scm create mode 120000 huffman-coding/pq.scm (limited to 'huffman-coding') diff --git a/huffman-coding/huffman-test.scm b/huffman-coding/huffman-test.scm new file mode 100644 index 0000000..b30c4ca --- /dev/null +++ b/huffman-coding/huffman-test.scm @@ -0,0 +1,179 @@ +;; -*- compile-command: "guile -L . huffman-test.scm"; -*- +(use-modules (srfi srfi-64) + (pq) + (huffman)) + +(define runner "a fast runner need never be afraid of the dark") +(define runner-opt-key '((#\v . "111001") (#\u . "111000") (#\t . "0100") (#\s . "111011") (#\r . "100") (#\o . "111010") (#\n . "1111") (#\k . "101101") (#\i . "101100") (#\h . "101111") (#\f . "1010") (#\e . "110") (#\d . "0101") (#\b . "101110") (#\a . "011") (#\space . "00"))) +(define abcde "A_DEAD_DAD_CEDED_A_BAD_BABE_A_BEADED_ABACA_BED") +(define abcde-opt-key '((#\_ . "01") (#\E . "110") (#\D . "00") (#\C . "1110") (#\B . "1111") (#\A . "10"))) +(define abcde-encoded "1001001101000010010000111101100011000011001111110000111111011111100110011111110100011000011011111011101001111111000") +(define (hash-table->alist ht) + (hash-map->list cons ht)) + +(define (node-pq->char-alist pq) + (map (lambda (ele) + (cons (node-char (car ele)) + (cdr ele))) + (car pq))) + +(define (extract-tree pq) + (car (pq-front pq))) + + +(test-begin "harness") + +(test-equal "hash-frequency zero chars, empty hash" + '() + (sort-rao (hash-table->alist (hash-frequency "")))) + +(test-equal "hash-frequency single 'c' key pair correct" + '((#\c . 1)) + (sort-rao (hash-table->alist (hash-frequency "c")))) + +(test-equal "hash-frequency double 'c' key pair correct" + '((#\c . 2)) + (sort-rao (hash-table->alist (hash-frequency "cc")))) + +(test-equal "hash-frequency couple letters key pair correct" + '((#\c . 2) (#\Y . 1)) + (sort-rao (hash-table->alist (hash-frequency "cYc")))) + +(test-equal "hash-frequency couple letters key pair correct" + '((#\c . 2) (#\Y . 1)) + (sort-rao (hash-table->alist (hash-frequency "cYc")))) + +(test-equal "hash-frequency abcde" + '((#\_ . 10) (#\E . 7) (#\D . 10) (#\C . 2) (#\B . 6) (#\A . 11)) + (sort-rao (hash-table->alist (hash-frequency abcde)))) + +(test-assert "frequencies->pq accepts hashtable and returns a pq" + (pq? (frequencies->pq (make-hash-table)))) + +(test-equal "frequencies->pq single entry" + '((#\a . -3)) + (node-pq->char-alist (frequencies->pq (hash-frequency "aaa")))) + +(test-equal "frequencies->pq double entry" + '((#\a . -3) (#\b . -3)) + (node-pq->char-alist (frequencies->pq (hash-frequency "aaabbb")))) + +(test-equal "frequencies->pq tie natural order" + '((#\a . -3) (#\b . -3) (#\c . -4)) + (node-pq->char-alist (frequencies->pq (hash-frequency "cabababccc")))) + +(test-equal "frequencies->pq tie reverse order" + '((#\a . -3) (#\b . -3) (#\c . -4)) + (node-pq->char-alist (frequencies->pq (hash-frequency "cbababaccc")))) + +(test-equal "frequencies->pq abcde" + '((#\C . -2) (#\B . -6) (#\E . -7) (#\D . -10) (#\_ . -10) (#\A . -11)) + (node-pq->char-alist (frequencies->pq (hash-frequency abcde)))) + +(let* ((pq (frequencies->pq (hash-frequency "aab"))) + (pqo (frequencies->pq (hash-frequency "aab"))) + (lc (car (pq-front pq))) + (rc (car (pq-rear pq))) + (p (begin (combine-and-reinsert! pq) + (car (pq-front pq))))) + + (test-assert "pq and pqo now different objects, but both pqs" + (and (not (equal? pq pqo)) + (pq? pq) + (pq? pqo))) + + (test-assert "combine-and-reinsert! left child is correct" + (equal? (node-left p) lc)) + + (test-assert "combine-and-reinsert! right child is correct" + (equal? (node-right p) rc)) + + (test-equal "combine-and-reinsert! correctly sums priority" + -3 + (cdr (pq-front pq)))) + +(let* ((pq (frequencies->pq (hash-frequency "aaabbbcccc")))) + (combine-and-reinsert! pq) + + (test-equal "combine-and-reinsert! first elements combined" + #\c + (node-char (car (pq-front pq)))) + + (test-equal "combine-and-reinsert! new node inserted in correct spot" + -6 + (cdr (pq-rear pq)))) + +(let ((pq (combine-all! (frequencies->pq (hash-frequency runner))))) + + (test-assert "combine-all! returns a pq" + (pq? pq)) + + (test-equal "combine-all! pq is of length one" + 1 + (pq-length pq)) + + (test-assert "combine-all! root item is a node" + (node? (car (pq-front pq)))) + + (test-equal "combine-all! root frequency matches length of str" + -46 + (cdr (pq-front pq)))) + +(test-equal "create-encodings level-1 tree" + '((#\a . "0")) + (sort-rao (create-encodings (extract-tree + (combine-all! + (frequencies->pq + (hash-frequency "a"))))))) + +;;;; note: verified manually: https://en.wikipedia.org/wiki/Huffman_coding +(test-equal "create-encodings abcde" + abcde-opt-key + (sort-rao (create-encodings (extract-tree + (combine-all! + (frequencies->pq + (hash-frequency abcde))))))) + +(test-equal "create-encodings runner" + runner-opt-key + (sort-rao (create-encodings (extract-tree + (combine-all! + (frequencies->pq + (hash-frequency runner))))))) + +(let ((none (encode-text ""))) + + (test-assert "encode-text nothing empty string" + (string-null? (car none))) + + (test-assert "encode-text nothing empty alist" + (null? (cdr none)))) + +(let ((limit (encode-text abcde))) + + (test-equal "encode-text abcde string" + abcde-encoded + (car limit)) + + (test-equal "encode-text abcde alist" + abcde-opt-key + (sort-rao (cdr limit)))) + +(test-equal "decode single char" + "x" + (decode-text "00011" '((#\x . "00011") (#\y . "001")))) + +(test-equal "decode two chars" + "xq" + (decode-text "00011" '((#\x . "000") (#\y . "011") (#\q . "11")))) + +(test-equal "decode abcde" + abcde + (decode-text abcde-encoded abcde-opt-key)) + +(test-error "decode abcde nonsense key" + #t + (decode-text abcde-encoded runner-opt-key)) + + +(test-end "harness") diff --git a/huffman-coding/huffman.scm b/huffman-coding/huffman.scm new file mode 100644 index 0000000..392ee4c --- /dev/null +++ b/huffman-coding/huffman.scm @@ -0,0 +1,124 @@ +(define-module (huffman) + #:use-module (srfi srfi-9) + #:use-module (pq) ;; my priority queue implementation + #:export (make-node + node? + node-char + node-left + node-right + decode-text + encode-text + create-encodings + combine-all! + combine-and-reinsert! + frequencies->pq + hash-frequency + sort-rao)) + + +(define-record-type + (make-node char left right) + node? + (char node-char) + (left node-left set-node-left!) + (right node-right set-node-right!)) + +(define (decode-text str keys) + "Does the opposite of encode-text." + (let ((patterns (map (lambda (pair) (cons (cdr pair) (car pair))) + keys))) + (let loop ((lst (string->list str)) (cur '()) (res '())) + (let ((decoded-char (assoc-ref patterns (list->string (reverse cur))))) + (cond + ((null? lst) + (if decoded-char + (list->string (reverse (cons decoded-char res))) + (throw 'mismatched-encoding-key str keys))) + (decoded-char (loop (cdr lst) (cons (car lst) '()) (cons decoded-char res))) + (#t (loop (cdr lst) (cons (car lst) cur) res))))))) + +(define (encode-text str) + "Given a STR, returns the encoded string, +alongside the used encoding scheme." + (let ((hash-pq (frequencies->pq (hash-frequency str)))) + (if (equal? (pq-length hash-pq) 0) + (cons "" '()) + (let ((patterns (create-encodings + (car (pq-front + (combine-all! hash-pq)))))) + (let loop ((lst (string->list str)) + (res "")) + (if (null? lst) + (cons res patterns) + (loop (cdr lst) + (string-append res (assoc-ref patterns (car lst)))))))))) + +(define (create-encodings root) + "Given node ROOT, traverses the binary tree, +constructing bit patterns for each letter." + (let loop ((str "") + (tree root)) + (if (node-char tree) + (acons (node-char tree) (if (string-null? str) "0" str) '()) + (append (loop (string-append str "0") (node-left tree)) + (loop (string-append str "1") (node-right tree)))))) + +(define (combine-all! pq) + "Given priority PQ, builds an encoding tree +by recursively combining high-priority nodes." + (if (>= 1 (pq-length pq)) + pq + (begin (combine-and-reinsert! pq) + (combine-all! pq)))) + +(define (combine-and-reinsert! pq) + "Given priority queue PQ, retrieves the two +largest-priority elements, combines, and reinserts +them." + (let* ((e1 (pq-pop! pq)) + (e2 (pq-pop! pq)) + (c1 (car e1)) + (c2 (car e2)) + (p1 (make-node #f c1 c2))) + (pq-push! pq p1 (+ (cdr e1) (cdr e2))))) + +(define (frequencies->pq ht) + "Given hash table HT of characters +corresponding to frequencies, returns +a frequency and alphabetically sorted +priority queue of nodes. + +Negates the frequency, because we want +to pull the smallest numbers first." + (let loop ((els (sort-rao (hash-map->list cons ht))) + (pq (make-pq))) + (if (null? els) + pq + (begin + (pq-push! pq (make-node (caar els) #f #f) (- (cdar els))) + (loop (cdr els) pq))))) + +(define (hash-frequency str) + "Given STR, returns a hash table of +each character, corresponding to the number +of times it appears in STR." + (let ((dict (make-hash-table))) + (let loop ((chrs (string->list str))) + (if (null? chrs) + dict + (begin (hashq-set! dict (car chrs) (1+ (dict-occur-ref dict (car chrs)))) + (loop (cdr chrs))))))) + +(define (dict-occur-ref dict e) + "A wrapper for hashq-ref. Returns '0' if an element +is not present, rather than '#f'." + (or (hashq-ref dict e) 0)) + +(define (sort-rao lst) + "Helper for frequencies->pq. + +Given an alist, sorts it in +reverse alphabetical order." + (sort lst + (lambda (a b) + (char>? (car a) (car b))))) diff --git a/huffman-coding/pq.scm b/huffman-coding/pq.scm new file mode 120000 index 0000000..11bf900 --- /dev/null +++ b/huffman-coding/pq.scm @@ -0,0 +1 @@ +../priority-queue/pq.scm \ No newline at end of file -- cgit v1.2.3