#unicode #binary #codec #base256 #text-encoding

no-std base256u

Simple mapping between bytes and Unicode codepoints

4 stable releases

1.1.1 Feb 21, 2024
1.0.1 Feb 21, 2024
1.0.0 Jun 17, 2022

#370 in Encoding


Used in base256u-cli

MPL-2.0 license

17KB
191 lines

base256u

Just a simple Rust crate and CLI program to map between bytes and unicode glyphs. Includes reference printable-ascii-preserved Unicode (papu) encoder and decoder functions, as well as emoji ones. The papu encoding will preserve all text that is already only printable ascii characters and all the other bytes map to single-codepoint non-combining printable glyphs, skipping odd things like NBSP and SHY.

The CLI

$ cargo install base256u-cli
...

$ python -c 'import sys; sys.stdout.buffer.write(bytes(list(range(256))))' | base256u
°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏ !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJK
LMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~§ĀāĂ㥹ĆćĈĉĊċČčĎďĐđĒēĔĕĖė
ĘęĚěĜĝĞğĠġĢģĤĥĦħĨĩĪīĬĭĮįİıIJijĴĵĶķĸĹĺĻļĽľĿŀŁłŃńŅņŇň¤ŊŋŌōŎŏŐőŒœŔŕŖŗŘřŚśŜŝŞşŠšŢţ
ŤťŦŧŨũŪūŬŭŮůŰűŲųŴŵŶŷŸŹźŻżŽžſ

$ python -c 'import sys; sys.stdout.buffer.write(bytes(list(range(256))))' | base256u -ew 36
🌰🌱🌲🌳🌴🌵🌶🌷🌸🌹🌺🌻🌼🌽🌾🌿🍀🍁🍂🍃🍄🍅🍆🍇🍈🍉🍊🍋🍌🍍🍎🍏🍐🍑🍒🍓
🍔🍕🍖🍗🍘🍙🍚🍛🍜🍝🍞🍟🍠🍡🍢🍣🍤🍥🍦🍧🍨🍩🍪🍫🍬🍭🍮🍯🍰🍱🍲🍳🍴🍵🍶🍷
🍸🍹🍺🍻🍼🍽🍾🍿🐀🐁🐂🐃🐄🐅🐆🐇🐈🐉🐊🐋🐌🐍🐎🐏🐐🐑🐒🐓🐔🐕🐖🐗🐘🐙🐚🐛
🐜🐝🐞🐟🐠🐡🐢🐣🐤🐥🐦🐧🐨🐩🐪🐫🐬🐭🐮🐯🐰🐱🐲🐳🐴🐵🐶🐷🐸🐹🐺🐻🐼🐽🐾🐿
😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣
😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇
🙈🙉🙊🙋🙌🙍🙎🙏🤐🤑🤒🤓🤔🤕🤖🤗🤘🤙🤚🤛🤜🤝🤞🤟🤠🤡🤢🤣🤤🤥🤦🤧🤨🤩🤪🤫
🤬🤭🤮🤯

$ echo Pack my box with five dozen liquor jugs. | base256u
Pack my box with five dozen liquor jugs.º

$ echo Pack my box with five dozen liquor jugs. | base256u | base256u -d
Pack my box with five dozen liquor jugs.

$ echo Pack my box with five dozen liquor jugs. | base256u -e
🐀🐑🐓🐛🍐🐝🐩🍐🐒🐟🐨🍐🐧🐙🐤🐘🍐🐖🐙🐦🐕🍐🐔🐟🐪🐕🐞🍐🐜🐙🐡🐥🐟🐢🍐🐚🐥🐗🐣🍞🌺

$ echo Pack my box with five dozen liquor jugs. | base256u -e | base256u -de
Pack my box with five dozen liquor jugs.

$ dd if=/dev/urandom bs=36 count=1 2>/dev/null | base256u
B^$Éıŷ/āűũļşÎ%Ăż'żşAtiŮqHtÀŔwĸÂWŲŋŪķ

$ dd if=/dev/urandom bs=36 count=1 2>/dev/null | base256u -e
😵😟🌻🤖🍮🤢😗🐣😡🍦🐖🐾🍺🐊🌻🍽🐹🍳😣🐈🙌🐿🙎🍪🤙🤝🍫🐰🐮🙈😆🍗🍍🤨🐺🍗

The crate

You can find the documentation in the usual place.

Using this crate is as simple as use base256u::{Decode, Encode}; and then calling the base256u() method or base256u_papu() to get the default papu encoding.

use crate::{Decode, Encode};

#[test]
fn encoding() {
    let encoded: String = (u8::MIN..=u8::MAX).base256u_papu().collect();
    assert_eq!(encoded, "°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏ !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~§ĀāĂ㥹ĆćĈĉĊċČčĎďĐđĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħĨĩĪīĬĭĮįİıIJijĴĵĶķĸĹĺĻļĽľĿŀŁłŃńŅņŇň¤ŊŋŌōŎŏŐőŒœŔŕŖŗŘřŚśŜŝŞşŠšŢţŤťŦŧŨũŪūŬŭŮůŰűŲųŴŵŶŷŸŹźŻżŽžſ");
    let encoded: String = b"Pack my box with five dozen liquor jugs."
        .into_iter()
        .copied()
        .base256u_papu()
        .collect();
    assert_eq!(encoded, "Pack my box with five dozen liquor jugs.");
}

#[test]
fn decoding() {
    let decoded: Vec<Option<u8>> = "°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏ !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~§ĀāĂ㥹ĆćĈĉĊċČčĎďĐđĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħĨĩĪīĬĭĮįİıIJijĴĵĶķĸĹĺĻļĽľĿŀŁłŃńŅņŇň¤ŊŋŌōŎŏŐőŒœŔŕŖŗŘřŚśŜŝŞşŠšŢţŤťŦŧŨũŪūŬŭŮůŰűŲųŴŵŶŷŸŹźŻżŽžſƝʼn".chars().base256u_papu().collect();
    let mut matcher: Vec<Option<u8>> = (u8::MIN..=u8::MAX).map(|b| Some(b)).collect();
    matcher.push(None);
    matcher.push(None);
    assert_eq!(decoded, matcher);
    let decoded: Vec<u8> = "Pack my box with five dozen liquor jugs."
        .chars()
        .base256u_papu()
        .map(|c| c.unwrap())
        .collect();
    assert_eq!(
        String::from_utf8(decoded).unwrap(),
        "Pack my box with five dozen liquor jugs."
    );
}

No runtime deps