We present novel online approximations of the Lempel-Ziv 77 (LZ77) and Lempel-Ziv 78 (LZ78) compression schemes [Lempel & Ziv, 1977/1978] with parameterizable space usage based on estimating which k patterns occur the most frequently in the streamed input for parameter k. This new approach overcomes the issue of finding only local repetitions, which is a natural limitation of algorithms that compress using a sliding window or by partitioning the input into blocks. For this, we introduce the top-k trie, a summary for maintaining online the top-k frequent consecutive patterns in a stream of characters based on a combination of the Lempel-Ziv 78 compression scheme and the Misra-Gries algorithm for frequent item estimation in streams. Using straightforward encoding, our implementations yield compression ratios (output over input size) competitive with established general-purpose LZ-based compression utilities such as gzip or xz.
% Conference paper from SEA 2024, published in the LIPIcs series (volume 301).
% Only acronyms are brace-protected in the title so that bibliography styles
% remain free to apply their own casing to the remaining words.
@inproceedings{dinklage_et_al:LIPIcs.SEA.2024.9,
  author    = {Dinklage, Patrick and Fischer, Johannes and Prezza, Nicola},
  title     = {Top-k Frequent Patterns in Streams and Parameterized-Space {LZ} Compression},
  booktitle = {22nd International Symposium on Experimental Algorithms ({SEA} 2024)},
  editor    = {Liberti, Leo},
  series    = {Leibniz International Proceedings in Informatics ({LIPIcs})},
  volume    = {301},
  pages     = {9:1--9:20},
  publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
  address   = {Dagstuhl, Germany},
  year      = {2024},
  isbn      = {978-3-95977-325-6},
  issn      = {1868-8969},
  doi       = {10.4230/LIPIcs.SEA.2024.9},
  url       = {https://drops.dagstuhl.de/entities/document/10.4230/LIPIcs.SEA.2024.9},
  urn       = {urn:nbn:de:0030-drops-203748},
  annote    = {Keywords: compression, streaming, heavy hitters, algorithm engineering},
}
Feedback for Dagstuhl Publishing