Module unicode

This module provides support to handle the Unicode UTF-8 encoding.

Types

RuneImpl = int32
  Source
Rune = distinct RuneImpl
type that can hold any Unicode character   Source
Rune16 = distinct int16
16 bit Unicode character   Source

Procs

proc `<=%`(a, b: Rune): bool {.raises: [], tags: [].}
  Source
proc `<%`(a, b: Rune): bool {.raises: [], tags: [].}
  Source
proc `==`(a, b: Rune): bool {.raises: [], tags: [].}
  Source
proc runeLen(s: string): int {.gcsafe, extern: "nuc$1", raises: [], tags: [].}
Returns the number of Unicode characters of the string s   Source
proc runeLenAt(s: string; i: Natural): int {.raises: [], tags: [].}
Returns the number of bytes the rune starting at s[i] takes   Source
proc validateUtf8(s: string): int {.raises: [], tags: [].}
Returns the position of the invalid byte in s if the string s does not hold valid UTF-8 data. Otherwise -1 is returned.   Source
proc runeAt(s: string; i: Natural): Rune {.raises: [], tags: [].}
Returns the unicode character in s at byte index i   Source
proc toUTF8(c: Rune): string {.gcsafe, extern: "nuc$1", raises: [], tags: [].}
Converts a rune into its UTF-8 representation   Source
proc `$`(rune: Rune): string {.raises: [], tags: [].}
Converts a Rune to a string   Source
proc `$`(runes: seq[Rune]): string {.raises: [], tags: [].}
Converts a sequence of Runes to a string   Source
proc runeOffset(s: string; pos: Natural; start: Natural = 0): int {.raises: [], tags: [].}

Returns the byte position of the Unicode character at position pos in s, with an optional start byte position. Returns the special value -1 if it runs out of the string.

Beware: This can lead to unoptimized code and slow execution! Most problems are solved more efficiently by using an iterator or conversion to a seq of Rune.

  Source
proc runeAtPos(s: string; pos: int): Rune {.raises: [], tags: [].}

Returns the unicode character at position pos

Beware: This can lead to unoptimized code and slow execution! Most problems are solved more efficiently by using an iterator or conversion to a seq of Rune.

  Source
proc runeStrAtPos(s: string; pos: Natural): string {.raises: [], tags: [].}

Returns the Unicode character at position pos as a UTF-8 string

Beware: This can lead to unoptimized code and slow execution! Most problems are solved more efficiently by using an iterator or conversion to a seq of Rune.

  Source
proc runeReverseOffset(s: string; rev: Positive): (int, int) {.raises: [], tags: [].}

Returns a tuple with the byte offset of the Unicode character at position rev in s, counting from the end (starting with 1), and the total number of runes in the string. Returns a negative value for offset if there are too few runes in the string to satisfy the request.

Beware: This can lead to unoptimized code and slow execution! Most problems are solved more efficiently by using an iterator or conversion to a seq of Rune.

  Source
proc runeSubStr(s: string; pos: int; len: int = int.high): string {.raises: [], tags: [].}

Returns the UTF-8 substring starting at codepoint pos with len codepoints. If pos or len is negative, they count from the end of the string. If len is not given it means the longest possible string.

(Needs some examples)

  Source
proc toLower(c: Rune): Rune {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Converts c into lower case. This works for any Unicode character. If possible, prefer toLower over toUpper.   Source
proc toUpper(c: Rune): Rune {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Converts c into upper case. This works for any Unicode character. If possible, prefer toLower over toUpper.   Source
proc toTitle(c: Rune): Rune {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Converts c to title case   Source
proc isLower(c: Rune): bool {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Returns true iff c is a lower case Unicode character. If possible, prefer isLower over isUpper.   Source
proc isUpper(c: Rune): bool {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Returns true iff c is an upper case Unicode character. If possible, prefer isLower over isUpper.   Source
proc isAlpha(c: Rune): bool {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Returns true iff c is an alpha Unicode character (i.e., a letter)   Source
proc isTitle(c: Rune): bool {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Returns true iff c is a Unicode titlecase character   Source
proc isWhiteSpace(c: Rune): bool {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Returns true iff c is a Unicode whitespace character   Source
proc isCombining(c: Rune): bool {.gcsafe, extern: "nuc$1", procvar, raises: [], tags: [].}
Returns true iff c is a Unicode combining character   Source
proc toRunes(s: string): seq[Rune] {.raises: [], tags: [].}
Obtains a sequence containing the Runes in s   Source
proc cmpRunesIgnoreCase(a, b: string): int {.gcsafe, extern: "nuc$1", procvar,
                                        raises: [], tags: [].}
Compares two UTF-8 strings and ignores the case. Returns:

0 iff a == b
< 0 iff a < b
> 0 iff a > b

  Source
proc reversed(s: string): string {.raises: [], tags: [].}
Returns the reverse of s, interpreting it as Unicode characters. Unicode combining characters are correctly interpreted as well:
assert reversed("Reverse this!") == "!siht esreveR"
assert reversed("先秦兩漢") == "漢兩秦先"
assert reversed("as⃝df̅") == "f̅ds⃝a"
assert reversed("a⃞b⃞c⃞") == "c⃞b⃞a⃞"
  Source
proc graphemeLen(s: string; i: Natural): Natural {.raises: [], tags: [].}
The number of bytes belonging to 's[i]' including following combining characters.   Source
proc lastRune(s: string; last: int): (Rune, int) {.raises: [], tags: [].}
Length of the last rune in 's[0..last]'. Returns the rune and its length in bytes.   Source

Iterators

iterator runes(s: string): Rune {.raises: [], tags: [].}
Iterates over any unicode character of the string s   Source

Templates

template fastRuneAt(s: string; i: int; result: expr; doInc = true)
Returns the Unicode character s[i] in result. If doInc == true i is incremented by the number of bytes that have been processed.   Source