5 changed files with 62 additions and 482 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -103,12 +103,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7"

 [[package]]
-name = "enc-check"
-version = "0.1.1"
+name = "enc"
+version = "0.1.0"
 dependencies = [
 "clap",
- "serde",
- "serde_json",
 "tabled",
 ]

@ -124,12 +122,6 @@ version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"

-[[package]]
-name = "itoa"
-version = "1.0.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38"
-
 [[package]]
 name = "papergrid"
 version = "0.10.0"
@ -183,43 +175,6 @@ dependencies = [
 "proc-macro2",
 ]

-[[package]]
-name = "ryu"
-version = "1.0.15"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741"
-
-[[package]]
-name = "serde"
-version = "1.0.188"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cf9e0fcba69a370eed61bcf2b728575f726b50b55cba78064753d708ddc7549e"
-dependencies = [
- "serde_derive",
-]
-
-[[package]]
-name = "serde_derive"
-version = "1.0.188"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn 2.0.38",
-]
-
-[[package]]
-name = "serde_json"
-version = "1.0.107"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6b420ce6e3d8bd882e9b243c6eed35dbc9a6110c9769e74b584e0d68d1f20c65"
-dependencies = [
- "itoa",
- "ryu",
- "serde",
-]
-
 [[package]]
 name = "strsim"
 version = "0.10.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -1,16 +1,10 @@
 [package]
-name = "enc-check"
-version = "0.1.1"
+name = "enc"
+version = "0.1.0"
 edition = "2021"
-authors = ["cool-mist <n.suryap@gmail.com>"]
-license = "GPL-3.0-only"
-description = "A tool to inspect utf-8 and utf-16 character encodings"
-homepage = "https://github.com/cool-mist/enc"
-repository = "https://github.com/cool-mist/enc"
-keywords = ["encodings", "utf-8", "utf8", "unicode"]
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

 [dependencies]
 clap = { version = "4.4.6", features = ["derive"] }
-serde = { version = "1.0.188", features = ["derive"] }
-serde_json = "1.0.107"
 tabled = "0.14.0"
--- a/README.md
+++ b/README.md
@ -1,53 +0,0 @@
-# enc-check #
-
-## Installation ##
-
-`cargo install enc-check`
-
-## Usage ##
-
-Inspect character encodings.
-
-```
-enc-check --help
-
-Usage: enc-check [OPTIONS] <-8|-6> <NAME>
-
-Arguments:
-  <NAME>
-          The string to inspect
-
-Options:
-  -8
-          Inspect utf-8
-  -6
-          Inspect utf-16
-  -j, --json
-          Output as json. Useful as a command line tool
-  -h, --help
-          Print help
-```
-
-
-```
-enc-check -8 asdᚢ𐌰
-
-┌───────┬───────┬───────────┬──────┬─────┬─────┬──────────┐
-│ U+dec │ U+hex │ character │ byte │ hex │ dec │ bin      │
-├───────┼───────┼───────────┼──────┼─────┼─────┼──────────┤
-│ 97    │ 61    │ a         │ 0    │ 61  │ 97  │ 01100001 │
-│ 115   │ 73    │ s         │ 1    │ 73  │ 115 │ 01110011 │
-│ 100   │ 64    │ d         │ 2    │ 64  │ 100 │ 01100100 │
-│ 5794  │ 16a2  │ ᚢ         │ 3    │ e1  │ 225 │ 11100001 │
-│       │       │           │ 4    │ 9a  │ 154 │ 10011010 │
-│       │       │           │ 5    │ a2  │ 162 │ 10100010 │
-│ 66352 │ 10330 │ 𐌰         │ 6    │ f0  │ 240 │ 11110000 │
-│       │       │           │ 7    │ 90  │ 144 │ 10010000 │
-│       │       │           │ 8    │ 8c  │ 140 │ 10001100 │
-│       │       │           │ 9    │ b0  │ 176 │ 10110000 │
-└───────┴───────┴───────────┴──────┴─────┴─────┴──────────┘
-```
-
-## Learn ##
-
- Summary of how encodings work is at `learn/slides.md`. Run it using [maaslalani/slides](https://github.com/maaslalani/slides).
--- a/learn/slides.md
+++ b/learn/slides.md
@ -1,223 +0,0 @@
-# Characters over the wire #
-
-Standards on sending, and parsing characters over the web.
-
-## Basic idea ##
-
- **Assign** a number to each character using a Character set.
- **Encode** the number to bytes using an encoding scheme.
- Transfer bytes over the internet
-
-The term "character set" is often used interchangeably with "character encoding" and "code page".
-
---
-
-# Common character sets #
-
-## ASCII ##
-
- It assigns a character-to-number mapping in the range 0-127 and covers English characters and some control codes (eg: new lines, tabs).
- Not everything from 0-127 is mapped.
-
-## Latin ##
-
- Also called ISO-8859-1 character set.
- This is an extension of ASCII and covers the Latin alphabet - À,ä...
- Number mappings up to 255.
-
-## Windows 1252 ##
-
- Superset of the Latin character set.
- Introduced by Microsoft.
-
-## Unicode ##
-
- Capable of defining a mapping for 1.1 million characters.
- Currently 150000 are defined.
- Each mapping is also called a unicode code point.
- Most languages - ஐ, ह
- Emojis 😮, 🤔
- Math ∫x.dx
-
---
-
-# Common encoding schemes #
-
- An encoding scheme will encode the number to one or more bytes.
-
-## Single byte encoding schemes ##
-
- Uses up only one byte.
- Suitable for ASCII, Latin and Windows 1252 character sets.
- ASCII would only take up 7 bits, while Latin and Windows 1252 would take up 8 bits.
- Because Windows 1252 is a superset of Latin, which is itself a superset of ASCII, Windows 1252 was the most used encoding scheme for a very long time.
- Today, it only accounts for 1.4% of the internet traffic.
-
-```
-┌───────┬───────┬───────────┬──────┬─────┬─────┬──────────┐
-│ U+dec │ U+hex │ character │ byte │ hex │ dec │ bin      │
-├───────┼───────┼───────────┼──────┼─────┼─────┼──────────┤
-│ 97    │ 61    │ a         │ 0    │ 61  │ 97  │ 01100001 │
-│ 98    │ 62    │ b         │ 1    │ 62  │ 98  │ 01100010 │
-│ 99    │ 63    │ c         │ 2    │ 63  │ 99  │ 01100011 │
-│ 100   │ 64    │ d         │ 3    │ 64  │ 100 │ 01100100 │
-└───────┴───────┴───────────┴──────┴─────┴─────┴──────────┘
-```
-
---
-
-# Common encoding schemes #
-
- An encoding scheme will encode the number to one or more bytes.
-
-## Multi byte encoding schemes ##
-
-### UTF - 8 ###
-
- Variable byte encoding scheme.
- 1 - 4 bytes to represent a unicode code point.
- Backward compatible with ASCII.
- Can represent a maximum number of 2097152 code points.
- 99% of the internet uses this encoding scheme.
-
-
- | Byte 1   | Byte 2   | Byte 3   | Byte 4   | Available bits |
- |----------|----------|----------|----------|----------------|
- | 0xxxxxxx | -        | -        | -        | 7              |
- | 110xxxxx | 10xxxxxx | -        | -        | 11             |
- | 1110xxxx | 10xxxxxx | 10xxxxxx | -        | 16             |
- | 11110xxx | 10xxxxxx | 10xxxxxx | 10xxxxxx | 21             |
-
-```
-┌────────┬───────┬───────────┬──────┬─────┬─────┬──────────┐
-│ U+dec  │ U+hex │ character │ byte │ hex │ dec │ bin      │
-├────────┼───────┼───────────┼──────┼─────┼─────┼──────────┤
-│ 97     │ 61    │ a         │ 0    │ 61  │ 97  │ 01100001 │
-│ 98     │ 62    │ b         │ 1    │ 62  │ 98  │ 01100010 │
-│ 2960   │ b90   │ ஐ         │ 2    │ e0  │ 224 │ 11100000 │
-│        │       │           │ 3    │ ae  │ 174 │ 10101110 │
-│        │       │           │ 4    │ 90  │ 144 │ 10010000 │
-│ 2361   │ 939   │ ह         │ 5    │ e0  │ 224 │ 11100000 │
-│        │       │           │ 6    │ a4  │ 164 │ 10100100 │
-│        │       │           │ 7    │ b9  │ 185 │ 10111001 │
-│ 129300 │ 1f914 │ 🤔        │ 8    │ f0  │ 240 │ 11110000 │
-│        │       │           │ 9    │ 9f  │ 159 │ 10011111 │
-│        │       │           │ 10   │ a4  │ 164 │ 10100100 │
-│        │       │           │ 11   │ 94  │ 148 │ 10010100 │
-└────────┴───────┴───────────┴──────┴─────┴─────┴──────────┘
-```
-
---
-
-# Common encoding schemes #
-
- An encoding scheme will encode the number to one or more bytes.
-
-## Multi byte encoding schemes ##
-
-### UTF - 16 ###
-
- Variable byte encoding scheme.
- 2 or 4 bytes to represent a unicode code point.
-
-```
-┌────────┬───────┬───────────┬──────┬─────┬─────┬──────────┐
-│ U+dec  │ U+hex │ character │ byte │ hex │ dec │ bin      │
-├────────┼───────┼───────────┼──────┼─────┼─────┼──────────┤
-│ 97     │ 61    │ a         │ 0    │ 00  │ 0   │ 00000000 │
-│        │       │           │ 1    │ 61  │ 97  │ 01100001 │
-│ 98     │ 62    │ b         │ 2    │ 00  │ 0   │ 00000000 │
-│        │       │           │ 3    │ 62  │ 98  │ 01100010 │
-│ 2960   │ b90   │ ஐ         │ 4    │ 0b  │ 11  │ 00001011 │
-│        │       │           │ 5    │ 90  │ 144 │ 10010000 │
-│ 2361   │ 939   │ ह         │ 6    │ 09  │ 9   │ 00001001 │
-│        │       │           │ 7    │ 39  │ 57  │ 00111001 │
-│ 129300 │ 1f914 │ 🤔        │ 8    │ d8  │ 216 │ 11011000 │
-│        │       │           │ 9    │ 3e  │ 62  │ 00111110 │
-│        │       │           │ 10   │ dd  │ 221 │ 11011101 │
-│        │       │           │ 11   │ 14  │ 20  │ 00010100 │
-└────────┴───────┴───────────┴──────┴─────┴─────┴──────────┘
-```
-
---
-
-# URL Encoding #
-
- Applicable only for HTTP traffic.
- Some characters have a special meaning in the url string, eg: &, #, ?
- The url string should also be only in ASCII.
- These characters should be treated differently.
-
-## Steps to URL-encode a string ##
-
- Encode the string in one of the encoding schemes.
- If a particular character cannot appear in the url string, or is not ASCII, replace it with the hex representation of each of its encoded bytes, each byte prefixed with a `%`.
-
-```
-┌───────┬───────┬───────────┬──────┬─────┬─────┬──────────┐
-│ U+dec │ U+hex │ character │ byte │ hex │ dec │ bin      │
-├───────┼───────┼───────────┼──────┼─────┼─────┼──────────┤
-│ 38    │ 26    │ &         │ 0    │ 26  │ 38  │ 00100110 │
-│ 63    │ 3f    │ ?         │ 1    │ 3f  │ 63  │ 00111111 │
-└───────┴───────┴───────────┴──────┴─────┴─────┴──────────┘
-```
-
- For example, if the url string `p1&/pw?` were to be url-encoded under utf-8 encoding, then it would be `p1%26/pw%3f`
-
-```
-┌───────┬───────┬───────────┬──────┬─────┬─────┬──────────┐
-│ U+dec │ U+hex │ character │ byte │ hex │ dec │ bin      │
-├───────┼───────┼───────────┼──────┼─────┼─────┼──────────┤
-│ 38    │ 26    │ &         │ 0    │ 00  │ 0   │ 00000000 │
-│       │       │           │ 1    │ 26  │ 38  │ 00100110 │
-│ 63    │ 3f    │ ?         │ 2    │ 00  │ 0   │ 00000000 │
-│       │       │           │ 3    │ 3f  │ 63  │ 00111111 │
-└───────┴───────┴───────────┴──────┴─────┴─────┴──────────┘
-```
-
- Under utf-16 encoding, it would be `p1%00%26/pw%00%3f`
-
---
-
-# What should be supported in applications? #
-
- Support Unicode code points encoded as utf-8 characters.
- URL encode under utf-8.
-
---
-
-# What is a character? #
-
- It is a group of unicode code points - also called a grapheme cluster.
- Eg: the character 'ப்' consists of 2 unicode code points as seen below.
-
-```
-┌───────┬───────┬───────────┬──────┬─────┬─────┬──────────┐
-│ U+dec │ U+hex │ character │ byte │ hex │ dec │ bin      │
-├───────┼───────┼───────────┼──────┼─────┼─────┼──────────┤
-│ 2986  │ baa   │ ப         │ 0    │ e0  │ 224 │ 11100000 │
-│       │       │           │ 1    │ ae  │ 174 │ 10101110 │
-│       │       │           │ 2    │ aa  │ 170 │ 10101010 │
-│ 3021  │ bcd   │ ்         │ 3    │ e0  │ 224 │ 11100000 │
-│       │       │           │ 4    │ af  │ 175 │ 10101111 │
-│       │       │           │ 5    │ 8d  │ 141 │ 10001101 │
-└───────┴───────┴───────────┴──────┴─────┴─────┴──────────┘
-```
-
- Number of characters in a string is often different from `string.Length`.
- Some languages (eg: python) return the number of unicode code points.
- Some languages (eg: C#) will return the number of utf-16 code units (2 bytes each) needed to encode the complete string.
- The below emoji is of length 1 in python and length 2 in c# (two utf-16 code units, i.e. the 4 bytes shown).
-
-```
-┌────────┬───────┬───────────┬──────┬─────┬─────┬──────────┐
-│ U+dec  │ U+hex │ character │ byte │ hex │ dec │ bin      │
-├────────┼───────┼───────────┼──────┼─────┼─────┼──────────┤
-│ 129300 │ 1f914 │ 🤔        │ 0    │ d8  │ 216 │ 11011000 │
-│        │       │           │ 1    │ 3e  │ 62  │ 00111110 │
-│        │       │           │ 2    │ dd  │ 221 │ 11011101 │
-│        │       │           │ 3    │ 14  │ 20  │ 00010100 │
-└────────┴───────┴───────────┴──────┴─────┴─────┴──────────┘
-```
-
- Be careful about advertising character length limitations.
--- a/src/main.rs
+++ b/src/main.rs
@ -1,13 +1,21 @@
-use clap::{Args, Parser};
-use serde::Serialize;
+use clap::Parser;
+
 use tabled::{
    builder::Builder,
-    settings::{object::Rows, Alignment, Modify, Style},
+    settings::{Modify, object::Rows, Alignment, Style}
 };

+#[derive(Parser)]
+struct CliArgs {
+    name: String,
+
+    #[arg(short = 'j', long = "json", action)]
+    json: bool,
+}
+
 struct StringDetail {
    characters: Vec<CharacterDetail>,
-    length: usize,
+    len: usize,
 }

 struct CharacterDetail {
@ -17,30 +25,15 @@ struct CharacterDetail {
 }

 impl StringDetail{
-    fn parse_utf8(query: &str) -> Self {
+    fn parse_utf8(query: &String) -> Self {
        let mut details:StringDetail = StringDetail::default();
        for i in query.chars() {
            let mut bytes = [0; 4];
-            let encoded = i.encode_utf8(&mut bytes);
+            i.encode_utf8(&mut bytes);

-            let mut citer = vec![i].into_iter();
-            for b in encoded.bytes() {
-                details.push(citer.next(), b);
-            }
-        }
-
-        details
-    }
-
-    fn parse_utf16(query: &String) -> Self {
-        let mut details: StringDetail = StringDetail::default();
-        for i in query.chars() {
-            let mut bytes = [0; 2];
-            let encoded = i.encode_utf16(&mut bytes);
-
-            let mut citer = vec![i].into_iter();
-            for b in encoded {
-                details.push_utf16(citer.next(), *b);
+            details.push(Some(i), bytes[0]);
+            for b in 1..i.len_utf8() {
+                details.push(None, bytes[b]);
            }
        }

@ -48,48 +41,40 @@ impl StringDetail {
    }

    fn default() -> Self {
-        Self {
-            characters: Vec::new(),
-            length: 0,
-        }
+        Self { characters: Vec::new(), len: 0 }
    }

-    fn push(&mut self, character: Option<char>, byte: u8) {
-        self.characters.push(CharacterDetail {
-            byte_index: self.length,
+    fn push(&mut self, character:Option<char>, byte:u8){ self.characters
+            .push(CharacterDetail {
+                byte_index: self.len,
                character,
                byte,
            });
-        self.length += 1;
+        self.len += 1;
    }

-    fn push_utf16(&mut self, character: Option<char>, byte: u16) {
-        let bytes = byte.to_be_bytes();
-        self.push(character, bytes[0]);
-        self.push(None, bytes[1]);
-    }
+    fn print_table(&self) {
+        let mut table_builder = Builder::default();
+        table_builder.set_header(StringDetail::table_header());
+        for i in self.table_rows() {
+            table_builder.push_record(i);
        }

-#[derive(Serialize)]
-struct StringTable {
-    characters: Vec<StringTableRow>,
-    length: usize,
+        let table = table_builder.build()
+            .with(Style::sharp())
+            .with(Modify::new(Rows::new(1..)).with(Alignment::left()))
+            .to_string();
+        print!("{}", table);
    }

-#[derive(Serialize)]
-struct StringTableRow {
-    unicode: String,
-    unicode_hex: String,
-    character: String,
-    byte: String,
-    hex: String,
-    dec: String,
-    bin: String,
+    fn table_rows(&self) -> Vec<Vec<String>> {
+        self.characters.iter()
+                .map(StringDetail::to_table_row)
+                .collect::<Vec<Vec<_>>>()
    }

-impl StringTableRow {
-    fn from(char_detail: &CharacterDetail) -> Self {
-        let empty = "";
+    fn to_table_row(char_detail: &CharacterDetail) -> Vec<String> {
+        let empty = "<->";
        let mut character = String::from(empty);
        let mut unicode = String::from(empty);
        let mut unicode_hex = String::from(empty);
@ -99,25 +84,25 @@ impl StringTableRow {
                unicode = String::from(format!("{}", x as u32));
                unicode_hex = String::from(format!("{:x}", x as u32));
            }
-            None => {}
+            None => {
+            }
        };
        let byte = format!("{}", char_detail.byte_index);
        let hex = format!("{:02x}", char_detail.byte);
        let dec = format!("{}", char_detail.byte);
        let bin = format!("{:08b}", char_detail.byte);

-        StringTableRow {
+        vec![
            unicode,
            unicode_hex,
            character,
            byte,
            hex,
            dec,
-            bin,
-        }
+            bin]
    }

-    fn header() -> Vec<String> {
+    fn table_header() -> Vec<String> {
        vec![
            String::from("U+dec"),
            String::from("U+hex"),
@ -129,91 +114,13 @@ impl StringTableRow {
        ]
    }

-    fn to_table_row(self) -> Vec<String> {
-        vec![
-            self.unicode,
-            self.unicode_hex,
-            self.character,
-            self.byte,
-            self.hex,
-            self.dec,
-            self.bin,
-        ]
-    }
-}
-
-impl StringTable {
-    fn from(string_details: &StringDetail) -> Self {
-        let characters = string_details
-            .characters
-            .iter()
-            .map(StringTableRow::from)
-            .collect::<Vec<StringTableRow>>();
-
-        StringTable {
-            characters,
-            length: string_details.length,
-        }
-    }
-
-    fn as_table(self) -> String {
-        let mut table_builder = Builder::default();
-        table_builder.set_header(StringTableRow::header());
-        for i in self.characters {
-            table_builder.push_record(i.to_table_row());
-        }
-
-        let table = table_builder
-            .build()
-            .with(Style::sharp())
-            .with(Modify::new(Rows::new(1..)).with(Alignment::left()))
-            .to_string();
-
-        format!("{}", table)
-    }
-
-    fn as_json(self) -> String {
-        format!("{}", serde_json::to_string(&self).unwrap())
-    }
-}
-
-#[derive(Parser)]
-#[command(next_line_help = true)]
-struct CliArgs {
-    /// The string to inspect
-    name: String,
-
-    #[command(flatten)]
-    inspect: InspectArgs,
-
-    /// Output as json. Useful as a command line tool
-    #[arg(short = 'j', long = "json", action)]
-    json: bool,
-}
-
-#[derive(Args)]
-#[group(required = true, multiple = false)]
-struct InspectArgs {
-    /// Inspect utf-8
-    #[arg(short = '8', action)]
-    utf8: bool,
-
-    /// Inspect utf-16
-    #[arg(short = '6', action)]
-    utf16: bool,
 }

 fn main() {
    let cli = CliArgs::parse();
-    let details = match cli.inspect.utf8 {
-        true => StringDetail::parse_utf8(&cli.name),
-        false => StringDetail::parse_utf16(&cli.name),
-    };
-
-    let char_table = StringTable::from(&details);
-
+    let utf8 = StringDetail::parse_utf8(&cli.name);
    match cli.json {
-        false => println!("{}", char_table.as_table()),
-        true => println!("{}", char_table.as_json()),
+        false => utf8.print_table(),
+        _ => panic!("Not yet implemented!!"),
    }
 }