Compare commits
No commits in common. "44ae93fc0684c1b24785716c9f5d00b5910e6573" and "1da9d1ac68347cda9d5725aae2f7bde94ee792f5" have entirely different histories.
44ae93fc06
...
1da9d1ac68
49
Cargo.lock
generated
49
Cargo.lock
generated
@ -103,12 +103,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7"
|
checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "enc-check"
|
name = "enc"
|
||||||
version = "0.1.1"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
"serde",
|
|
||||||
"serde_json",
|
|
||||||
"tabled",
|
"tabled",
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -124,12 +122,6 @@ version = "0.4.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
|
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "itoa"
|
|
||||||
version = "1.0.9"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "papergrid"
|
name = "papergrid"
|
||||||
version = "0.10.0"
|
version = "0.10.0"
|
||||||
@ -183,43 +175,6 @@ dependencies = [
|
|||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "ryu"
|
|
||||||
version = "1.0.15"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741"
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "serde"
|
|
||||||
version = "1.0.188"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "cf9e0fcba69a370eed61bcf2b728575f726b50b55cba78064753d708ddc7549e"
|
|
||||||
dependencies = [
|
|
||||||
"serde_derive",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "serde_derive"
|
|
||||||
version = "1.0.188"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2"
|
|
||||||
dependencies = [
|
|
||||||
"proc-macro2",
|
|
||||||
"quote",
|
|
||||||
"syn 2.0.38",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "serde_json"
|
|
||||||
version = "1.0.107"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "6b420ce6e3d8bd882e9b243c6eed35dbc9a6110c9769e74b584e0d68d1f20c65"
|
|
||||||
dependencies = [
|
|
||||||
"itoa",
|
|
||||||
"ryu",
|
|
||||||
"serde",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "strsim"
|
name = "strsim"
|
||||||
version = "0.10.0"
|
version = "0.10.0"
|
||||||
|
|||||||
14
Cargo.toml
14
Cargo.toml
@ -1,16 +1,10 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "enc-check"
|
name = "enc"
|
||||||
version = "0.1.1"
|
version = "0.1.0"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
authors = ["cool-mist <n.suryap@gmail.com>"]
|
|
||||||
license = "GPL-3.0-only"
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
description = "A tool to inspect utf-8 and utf-16 character encodings"
|
|
||||||
homepage = "https://github.com/cool-mist/enc"
|
|
||||||
repository = "https://github.com/cool-mist/enc"
|
|
||||||
keywords = ["encodings", "utf-8", "utf8", "unicode"]
|
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
clap = { version = "4.4.6", features = ["derive"] }
|
clap = { version = "4.4.6", features = ["derive"] }
|
||||||
serde = { version = "1.0.188", features = ["derive"] }
|
|
||||||
serde_json = "1.0.107"
|
|
||||||
tabled = "0.14.0"
|
tabled = "0.14.0"
|
||||||
|
|||||||
53
README.md
53
README.md
@ -1,53 +0,0 @@
|
|||||||
# enc-check #
|
|
||||||
|
|
||||||
## Installation ##
|
|
||||||
|
|
||||||
`cargo install enc-check`
|
|
||||||
|
|
||||||
## Usage ##
|
|
||||||
|
|
||||||
Inspect character encodings.
|
|
||||||
|
|
||||||
```
|
|
||||||
enc-check --help
|
|
||||||
|
|
||||||
Usage: enc-check [OPTIONS] <-8|-6> <NAME>
|
|
||||||
|
|
||||||
Arguments:
|
|
||||||
<NAME>
|
|
||||||
The string to inspect
|
|
||||||
|
|
||||||
Options:
|
|
||||||
-8
|
|
||||||
Inspect utf-8
|
|
||||||
-6
|
|
||||||
Inspect utf-16
|
|
||||||
-j, --json
|
|
||||||
Output as json. Useful as a command line tool
|
|
||||||
-h, --help
|
|
||||||
Print help
|
|
||||||
```
|
|
||||||
|
|
||||||
|
|
||||||
```
|
|
||||||
enc-check -8 asdᚢ𐌰
|
|
||||||
|
|
||||||
┌───────┬───────┬───────────┬──────┬─────┬─────┬──────────┐
|
|
||||||
│ U+dec │ U+hex │ character │ byte │ hex │ dec │ bin │
|
|
||||||
├───────┼───────┼───────────┼──────┼─────┼─────┼──────────┤
|
|
||||||
│ 97 │ 61 │ a │ 0 │ 61 │ 97 │ 01100001 │
|
|
||||||
│ 115 │ 73 │ s │ 1 │ 73 │ 115 │ 01110011 │
|
|
||||||
│ 100 │ 64 │ d │ 2 │ 64 │ 100 │ 01100100 │
|
|
||||||
│ 5794 │ 16a2 │ ᚢ │ 3 │ e1 │ 225 │ 11100001 │
|
|
||||||
│ │ │ │ 4 │ 9a │ 154 │ 10011010 │
|
|
||||||
│ │ │ │ 5 │ a2 │ 162 │ 10100010 │
|
|
||||||
│ 66352 │ 10330 │ 𐌰 │ 6 │ f0 │ 240 │ 11110000 │
|
|
||||||
│ │ │ │ 7 │ 90 │ 144 │ 10010000 │
|
|
||||||
│ │ │ │ 8 │ 8c │ 140 │ 10001100 │
|
|
||||||
│ │ │ │ 9 │ b0 │ 176 │ 10110000 │
|
|
||||||
└───────┴───────┴───────────┴──────┴─────┴─────┴──────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
## Learn ##
|
|
||||||
|
|
||||||
- Summary of how encodings work is at `learn/slides.md`. Run it using [maaslalani/slides](https://github.com/maaslalani/slides).
|
|
||||||
223
learn/slides.md
223
learn/slides.md
@ -1,223 +0,0 @@
|
|||||||
# Characters over the wire #
|
|
||||||
|
|
||||||
Standards on sending, and parsing characters over the web.
|
|
||||||
|
|
||||||
## Basic idea ##
|
|
||||||
|
|
||||||
- **Assign** a number to each character using a Character set.
|
|
||||||
- **Encode** the number to bytes using an encoding scheme.
|
|
||||||
- Transfer bytes over the internet
|
|
||||||
|
|
||||||
The term character set is often used interchangeably with character encoding and code page.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
# Common character sets #
|
|
||||||
|
|
||||||
## ASCII ##
|
|
||||||
|
|
||||||
- It assigns character-to-number mappings from 0-127 and covers English characters and some control codes (e.g. new lines, tabs).
|
|
||||||
- Not everything from 0-127 is mapped.
|
|
||||||
|
|
||||||
## Latin ##
|
|
||||||
|
|
||||||
- Also called ISO-8859-1 character set.
|
|
||||||
- This is an extension of ASCII and covers the Latin alphabet - À,ä...
|
|
||||||
- Number mappings up to 255.
|
|
||||||
|
|
||||||
## Windows 1252 ##
|
|
||||||
|
|
||||||
- Super set of Latin character set.
|
|
||||||
- Introduced by Microsoft.
|
|
||||||
|
|
||||||
## Unicode ##
|
|
||||||
|
|
||||||
- Capable of defining a mapping for 1.1 million characters.
|
|
||||||
- Currently 150000 are defined.
|
|
||||||
- Each mapping is also called a unicode code point.
|
|
||||||
- Most languages - ஐ, ह
|
|
||||||
- Emojis 😮, 🤔
|
|
||||||
- Math ∫x.dx
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
# Common encoding schemes #
|
|
||||||
|
|
||||||
- An encoding scheme will encode the number to one or more bytes.
|
|
||||||
|
|
||||||
## Single byte encoding schemes ##
|
|
||||||
|
|
||||||
- Uses up only one byte.
|
|
||||||
- Suitable for ASCII, Latin and Windows 1252 character sets.
|
|
||||||
- ASCII would only take up 7 bits, while Latin and Windows 1252 would take up 8 bits.
|
|
||||||
- Because Windows 1252 is a superset of Latin, which is also a super set of ASCII, for a very long time in the past, the most used encoding scheme was Windows 1252.
|
|
||||||
- Today, it only accounts for 1.4% of the internet traffic.
|
|
||||||
|
|
||||||
```
|
|
||||||
┌───────┬───────┬───────────┬──────┬─────┬─────┬──────────┐
|
|
||||||
│ U+dec │ U+hex │ character │ byte │ hex │ dec │ bin │
|
|
||||||
├───────┼───────┼───────────┼──────┼─────┼─────┼──────────┤
|
|
||||||
│ 97 │ 61 │ a │ 0 │ 61 │ 97 │ 01100001 │
|
|
||||||
│ 98 │ 62 │ b │ 1 │ 62 │ 98 │ 01100010 │
|
|
||||||
│ 99 │ 63 │ c │ 2 │ 63 │ 99 │ 01100011 │
|
|
||||||
│ 100 │ 64 │ d │ 3 │ 64 │ 100 │ 01100100 │
|
|
||||||
└───────┴───────┴───────────┴──────┴─────┴─────┴──────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
# Common encoding schemes #
|
|
||||||
|
|
||||||
- An encoding scheme will encode the number to one or more bytes.
|
|
||||||
|
|
||||||
## Multi byte encoding schemes ##
|
|
||||||
|
|
||||||
### UTF - 8 ###
|
|
||||||
|
|
||||||
- Variable byte encoding scheme.
|
|
||||||
- 1 - 4 bytes to represent a unicode code point.
|
|
||||||
- Backward compatible with ASCII.
|
|
||||||
- Can represent a maximum number of 2097152 code points.
|
|
||||||
- 99% of the internet uses this encoding scheme.
|
|
||||||
|
|
||||||
|
|
||||||
| Byte 1 | Byte 2 | Byte 3 | Byte 4 | Available bits |
|
|
||||||
|----------|----------|----------|----------|----------------|
|
|
||||||
| 0xxxxxxx | - | - | - | 7 |
|
|
||||||
| 110xxxxx | 10xxxxxx | - | - | 11 |
|
|
||||||
| 1110xxxx | 10xxxxxx | 10xxxxxx | - | 16 |
|
|
||||||
| 11110xxx | 10xxxxxx | 10xxxxxx | 10xxxxxx | 21 |
|
|
||||||
|
|
||||||
```
|
|
||||||
┌────────┬───────┬───────────┬──────┬─────┬─────┬──────────┐
|
|
||||||
│ U+dec │ U+hex │ character │ byte │ hex │ dec │ bin │
|
|
||||||
├────────┼───────┼───────────┼──────┼─────┼─────┼──────────┤
|
|
||||||
│ 97 │ 61 │ a │ 0 │ 61 │ 97 │ 01100001 │
|
|
||||||
│ 98 │ 62 │ b │ 1 │ 62 │ 98 │ 01100010 │
|
|
||||||
│ 2960 │ b90 │ ஐ │ 2 │ e0 │ 224 │ 11100000 │
|
|
||||||
│ │ │ │ 3 │ ae │ 174 │ 10101110 │
|
|
||||||
│ │ │ │ 4 │ 90 │ 144 │ 10010000 │
|
|
||||||
│ 2361 │ 939 │ ह │ 5 │ e0 │ 224 │ 11100000 │
|
|
||||||
│ │ │ │ 6 │ a4 │ 164 │ 10100100 │
|
|
||||||
│ │ │ │ 7 │ b9 │ 185 │ 10111001 │
|
|
||||||
│ 129300 │ 1f914 │ 🤔 │ 8 │ f0 │ 240 │ 11110000 │
|
|
||||||
│ │ │ │ 9 │ 9f │ 159 │ 10011111 │
|
|
||||||
│ │ │ │ 10 │ a4 │ 164 │ 10100100 │
|
|
||||||
│ │ │ │ 11 │ 94 │ 148 │ 10010100 │
|
|
||||||
└────────┴───────┴───────────┴──────┴─────┴─────┴──────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
# Common encoding schemes #
|
|
||||||
|
|
||||||
- An encoding scheme will encode the number to one or more bytes.
|
|
||||||
|
|
||||||
## Multi byte encoding schemes ##
|
|
||||||
|
|
||||||
### UTF - 16 ###
|
|
||||||
|
|
||||||
- Variable byte encoding scheme.
|
|
||||||
- 2 or 4 bytes to represent a unicode code point.
|
|
||||||
|
|
||||||
```
|
|
||||||
┌────────┬───────┬───────────┬──────┬─────┬─────┬──────────┐
|
|
||||||
│ U+dec │ U+hex │ character │ byte │ hex │ dec │ bin │
|
|
||||||
├────────┼───────┼───────────┼──────┼─────┼─────┼──────────┤
|
|
||||||
│ 97 │ 61 │ a │ 0 │ 00 │ 0 │ 00000000 │
|
|
||||||
│ │ │ │ 1 │ 61 │ 97 │ 01100001 │
|
|
||||||
│ 98 │ 62 │ b │ 2 │ 00 │ 0 │ 00000000 │
|
|
||||||
│ │ │ │ 3 │ 62 │ 98 │ 01100010 │
|
|
||||||
│ 2960 │ b90 │ ஐ │ 4 │ 0b │ 11 │ 00001011 │
|
|
||||||
│ │ │ │ 5 │ 90 │ 144 │ 10010000 │
|
|
||||||
│ 2361 │ 939 │ ह │ 6 │ 09 │ 9 │ 00001001 │
|
|
||||||
│ │ │ │ 7 │ 39 │ 57 │ 00111001 │
|
|
||||||
│ 129300 │ 1f914 │ 🤔 │ 8 │ d8 │ 216 │ 11011000 │
|
|
||||||
│ │ │ │ 9 │ 3e │ 62 │ 00111110 │
|
|
||||||
│ │ │ │ 10 │ dd │ 221 │ 11011101 │
|
|
||||||
│ │ │ │ 11 │ 14 │ 20 │ 00010100 │
|
|
||||||
└────────┴───────┴───────────┴──────┴─────┴─────┴──────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
# URL Encoding #
|
|
||||||
|
|
||||||
- Applicable only for HTTP traffic.
|
|
||||||
- Some characters have a special meaning in the URL string, e.g. &, #, ?
|
|
||||||
- The url string should also be only in ASCII.
|
|
||||||
- These characters should be treated differently.
|
|
||||||
|
|
||||||
## Steps to URL-encode a string ##
|
|
||||||
|
|
||||||
- Encode the string in one of the encoding schemes.
|
|
||||||
- If a particular character cannot appear in the url string, or is not ASCII, print the hex representation of each of its encoded bytes, each prefixed with a `%`.
|
|
||||||
|
|
||||||
```
|
|
||||||
┌───────┬───────┬───────────┬──────┬─────┬─────┬──────────┐
|
|
||||||
│ U+dec │ U+hex │ character │ byte │ hex │ dec │ bin │
|
|
||||||
├───────┼───────┼───────────┼──────┼─────┼─────┼──────────┤
|
|
||||||
│ 38 │ 26 │ & │ 0 │ 26 │ 38 │ 00100110 │
|
|
||||||
│ 63 │ 3f │ ? │ 1 │ 3f │ 63 │ 00111111 │
|
|
||||||
└───────┴───────┴───────────┴──────┴─────┴─────┴──────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
- For example, if the url string `p1&/pw?` were to be url-encoded under utf-8 encoding, then it would be `p1%26/pw%3f`
|
|
||||||
|
|
||||||
```
|
|
||||||
┌───────┬───────┬───────────┬──────┬─────┬─────┬──────────┐
|
|
||||||
│ U+dec │ U+hex │ character │ byte │ hex │ dec │ bin │
|
|
||||||
├───────┼───────┼───────────┼──────┼─────┼─────┼──────────┤
|
|
||||||
│ 38 │ 26 │ & │ 0 │ 00 │ 0 │ 00000000 │
|
|
||||||
│ │ │ │ 1 │ 26 │ 38 │ 00100110 │
|
|
||||||
│ 63 │ 3f │ ? │ 2 │ 00 │ 0 │ 00000000 │
|
|
||||||
│ │ │ │ 3 │ 3f │ 63 │ 00111111 │
|
|
||||||
└───────┴───────┴───────────┴──────┴─────┴─────┴──────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
- Under utf-16 encoding, it would be `p1%00%26/pw%00%3f`
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
# What should be supported in applications? #
|
|
||||||
|
|
||||||
- Support Unicode code points encoded as utf-8 characters.
|
|
||||||
- URL encode under utf-8.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
# What is a character? #
|
|
||||||
|
|
||||||
- It is a group of unicode code points - also called a grapheme cluster.
|
|
||||||
- Eg: the character 'ப்' consists of 2 unicode code points as seen below.
|
|
||||||
|
|
||||||
```
|
|
||||||
┌───────┬───────┬───────────┬──────┬─────┬─────┬──────────┐
|
|
||||||
│ U+dec │ U+hex │ character │ byte │ hex │ dec │ bin │
|
|
||||||
├───────┼───────┼───────────┼──────┼─────┼─────┼──────────┤
|
|
||||||
│ 2986 │ baa │ ப │ 0 │ e0 │ 224 │ 11100000 │
|
|
||||||
│ │ │ │ 1 │ ae │ 174 │ 10101110 │
|
|
||||||
│ │ │ │ 2 │ aa │ 170 │ 10101010 │
|
|
||||||
│ 3021 │ bcd │ ் │ 3 │ e0 │ 224 │ 11100000 │
|
|
||||||
│ │ │ │ 4 │ af │ 175 │ 10101111 │
|
|
||||||
│ │ │ │ 5 │ 8d │ 141 │ 10001101 │
|
|
||||||
└───────┴───────┴───────────┴──────┴─────┴─────┴──────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
- Number of characters in a string is often different from `string.Length`.
|
|
||||||
- Some languages (eg: python) return the number of unicode code points.
|
|
||||||
- Some languages (e.g. C#) will return the number of UTF-16 code units (2 bytes each) needed to encode the complete string.
|
|
||||||
- The emoji below is of length 1 in Python and length 2 in C# (two UTF-16 code units, i.e. 4 bytes).
|
|
||||||
|
|
||||||
```
|
|
||||||
┌────────┬───────┬───────────┬──────┬─────┬─────┬──────────┐
|
|
||||||
│ U+dec │ U+hex │ character │ byte │ hex │ dec │ bin │
|
|
||||||
├────────┼───────┼───────────┼──────┼─────┼─────┼──────────┤
|
|
||||||
│ 129300 │ 1f914 │ 🤔 │ 0 │ d8 │ 216 │ 11011000 │
|
|
||||||
│ │ │ │ 1 │ 3e │ 62 │ 00111110 │
|
|
||||||
│ │ │ │ 2 │ dd │ 221 │ 11011101 │
|
|
||||||
│ │ │ │ 3 │ 14 │ 20 │ 00010100 │
|
|
||||||
└────────┴───────┴───────────┴──────┴─────┴─────┴──────────┘
|
|
||||||
```
|
|
||||||
|
|
||||||
- Be careful about advertising character length limitations.
|
|
||||||
193
src/main.rs
193
src/main.rs
@ -1,13 +1,21 @@
|
|||||||
use clap::{Args, Parser};
|
use clap::Parser;
|
||||||
use serde::Serialize;
|
|
||||||
use tabled::{
|
use tabled::{
|
||||||
builder::Builder,
|
builder::Builder,
|
||||||
settings::{object::Rows, Alignment, Modify, Style},
|
settings::{Modify, object::Rows, Alignment, Style}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#[derive(Parser)]
|
||||||
|
struct CliArgs {
|
||||||
|
name: String,
|
||||||
|
|
||||||
|
#[arg(short = 'j', long = "json", action)]
|
||||||
|
json: bool,
|
||||||
|
}
|
||||||
|
|
||||||
struct StringDetail {
|
struct StringDetail {
|
||||||
characters: Vec<CharacterDetail>,
|
characters: Vec<CharacterDetail>,
|
||||||
length: usize,
|
len: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
struct CharacterDetail {
|
struct CharacterDetail {
|
||||||
@ -16,31 +24,16 @@ struct CharacterDetail {
|
|||||||
byte: u8,
|
byte: u8,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl StringDetail {
|
impl StringDetail{
|
||||||
fn parse_utf8(query: &str) -> Self {
|
fn parse_utf8(query: &String) -> Self {
|
||||||
let mut details: StringDetail = StringDetail::default();
|
let mut details:StringDetail = StringDetail::default();
|
||||||
for i in query.chars() {
|
for i in query.chars() {
|
||||||
let mut bytes = [0; 4];
|
let mut bytes = [0; 4];
|
||||||
let encoded = i.encode_utf8(&mut bytes);
|
i.encode_utf8(&mut bytes);
|
||||||
|
|
||||||
let mut citer = vec![i].into_iter();
|
details.push(Some(i), bytes[0]);
|
||||||
for b in encoded.bytes() {
|
for b in 1..i.len_utf8() {
|
||||||
details.push(citer.next(), b);
|
details.push(None, bytes[b]);
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
details
|
|
||||||
}
|
|
||||||
|
|
||||||
fn parse_utf16(query: &String) -> Self {
|
|
||||||
let mut details: StringDetail = StringDetail::default();
|
|
||||||
for i in query.chars() {
|
|
||||||
let mut bytes = [0; 2];
|
|
||||||
let encoded = i.encode_utf16(&mut bytes);
|
|
||||||
|
|
||||||
let mut citer = vec![i].into_iter();
|
|
||||||
for b in encoded {
|
|
||||||
details.push_utf16(citer.next(), *b);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -48,48 +41,40 @@ impl StringDetail {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
Self {
|
Self { characters: Vec::new(), len: 0 }
|
||||||
characters: Vec::new(),
|
|
||||||
length: 0,
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn push(&mut self, character: Option<char>, byte: u8) {
|
fn push(&mut self, character:Option<char>, byte:u8){ self.characters
|
||||||
self.characters.push(CharacterDetail {
|
.push(CharacterDetail {
|
||||||
byte_index: self.length,
|
byte_index: self.len,
|
||||||
character,
|
character,
|
||||||
byte,
|
byte,
|
||||||
});
|
});
|
||||||
self.length += 1;
|
self.len += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
fn push_utf16(&mut self, character: Option<char>, byte: u16) {
|
fn print_table(&self) {
|
||||||
let bytes = byte.to_be_bytes();
|
let mut table_builder = Builder::default();
|
||||||
self.push(character, bytes[0]);
|
table_builder.set_header(StringDetail::table_header());
|
||||||
self.push(None, bytes[1]);
|
for i in self.table_rows() {
|
||||||
|
table_builder.push_record(i);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Serialize)]
|
let table = table_builder.build()
|
||||||
struct StringTable {
|
.with(Style::sharp())
|
||||||
characters: Vec<StringTableRow>,
|
.with(Modify::new(Rows::new(1..)).with(Alignment::left()))
|
||||||
length: usize,
|
.to_string();
|
||||||
}
|
print!("{}", table);
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Serialize)]
|
fn table_rows(&self) -> Vec<Vec<String>> {
|
||||||
struct StringTableRow {
|
self.characters.iter()
|
||||||
unicode: String,
|
.map(StringDetail::to_table_row)
|
||||||
unicode_hex: String,
|
.collect::<Vec<Vec<_>>>()
|
||||||
character: String,
|
}
|
||||||
byte: String,
|
|
||||||
hex: String,
|
|
||||||
dec: String,
|
|
||||||
bin: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl StringTableRow {
|
fn to_table_row(char_detail: &CharacterDetail) -> Vec<String> {
|
||||||
fn from(char_detail: &CharacterDetail) -> Self {
|
let empty = "<->";
|
||||||
let empty = "";
|
|
||||||
let mut character = String::from(empty);
|
let mut character = String::from(empty);
|
||||||
let mut unicode = String::from(empty);
|
let mut unicode = String::from(empty);
|
||||||
let mut unicode_hex = String::from(empty);
|
let mut unicode_hex = String::from(empty);
|
||||||
@ -99,25 +84,25 @@ impl StringTableRow {
|
|||||||
unicode = String::from(format!("{}", x as u32));
|
unicode = String::from(format!("{}", x as u32));
|
||||||
unicode_hex = String::from(format!("{:x}", x as u32));
|
unicode_hex = String::from(format!("{:x}", x as u32));
|
||||||
}
|
}
|
||||||
None => {}
|
None => {
|
||||||
|
}
|
||||||
};
|
};
|
||||||
let byte = format!("{}", char_detail.byte_index);
|
let byte = format!("{}", char_detail.byte_index);
|
||||||
let hex = format!("{:02x}", char_detail.byte);
|
let hex = format!("{:02x}", char_detail.byte);
|
||||||
let dec = format!("{}", char_detail.byte);
|
let dec = format!("{}", char_detail.byte);
|
||||||
let bin = format!("{:08b}", char_detail.byte);
|
let bin = format!("{:08b}", char_detail.byte);
|
||||||
|
|
||||||
StringTableRow {
|
vec![
|
||||||
unicode,
|
unicode,
|
||||||
unicode_hex,
|
unicode_hex,
|
||||||
character,
|
character,
|
||||||
byte,
|
byte,
|
||||||
hex,
|
hex,
|
||||||
dec,
|
dec,
|
||||||
bin,
|
bin]
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn header() -> Vec<String> {
|
fn table_header() -> Vec<String> {
|
||||||
vec![
|
vec![
|
||||||
String::from("U+dec"),
|
String::from("U+dec"),
|
||||||
String::from("U+hex"),
|
String::from("U+hex"),
|
||||||
@ -129,91 +114,13 @@ impl StringTableRow {
|
|||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
fn to_table_row(self) -> Vec<String> {
|
|
||||||
vec![
|
|
||||||
self.unicode,
|
|
||||||
self.unicode_hex,
|
|
||||||
self.character,
|
|
||||||
self.byte,
|
|
||||||
self.hex,
|
|
||||||
self.dec,
|
|
||||||
self.bin,
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl StringTable {
|
|
||||||
fn from(string_details: &StringDetail) -> Self {
|
|
||||||
let characters = string_details
|
|
||||||
.characters
|
|
||||||
.iter()
|
|
||||||
.map(StringTableRow::from)
|
|
||||||
.collect::<Vec<StringTableRow>>();
|
|
||||||
|
|
||||||
StringTable {
|
|
||||||
characters,
|
|
||||||
length: string_details.length,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn as_table(self) -> String {
|
|
||||||
let mut table_builder = Builder::default();
|
|
||||||
table_builder.set_header(StringTableRow::header());
|
|
||||||
for i in self.characters {
|
|
||||||
table_builder.push_record(i.to_table_row());
|
|
||||||
}
|
|
||||||
|
|
||||||
let table = table_builder
|
|
||||||
.build()
|
|
||||||
.with(Style::sharp())
|
|
||||||
.with(Modify::new(Rows::new(1..)).with(Alignment::left()))
|
|
||||||
.to_string();
|
|
||||||
|
|
||||||
format!("{}", table)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn as_json(self) -> String {
|
|
||||||
format!("{}", serde_json::to_string(&self).unwrap())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Parser)]
|
|
||||||
#[command(next_line_help = true)]
|
|
||||||
struct CliArgs {
|
|
||||||
/// The string to inspect
|
|
||||||
name: String,
|
|
||||||
|
|
||||||
#[command(flatten)]
|
|
||||||
inspect: InspectArgs,
|
|
||||||
|
|
||||||
/// Output as json. Useful as a command line tool
|
|
||||||
#[arg(short = 'j', long = "json", action)]
|
|
||||||
json: bool,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Args)]
|
|
||||||
#[group(required = true, multiple = false)]
|
|
||||||
struct InspectArgs {
|
|
||||||
/// Inspect utf-8
|
|
||||||
#[arg(short = '8', action)]
|
|
||||||
utf8: bool,
|
|
||||||
|
|
||||||
/// Inspect utf-16
|
|
||||||
#[arg(short = '6', action)]
|
|
||||||
utf16: bool,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
let cli = CliArgs::parse();
|
let cli = CliArgs::parse();
|
||||||
let details = match cli.inspect.utf8 {
|
let utf8 = StringDetail::parse_utf8(&cli.name);
|
||||||
true => StringDetail::parse_utf8(&cli.name),
|
|
||||||
false => StringDetail::parse_utf16(&cli.name),
|
|
||||||
};
|
|
||||||
|
|
||||||
let char_table = StringTable::from(&details);
|
|
||||||
|
|
||||||
match cli.json {
|
match cli.json {
|
||||||
false => println!("{}", char_table.as_table()),
|
false => utf8.print_table(),
|
||||||
true => println!("{}", char_table.as_json()),
|
_ => panic!("Not yet implemented!!"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user