Update go-cty to pull in more stdlib funcs.
I needed "split" specifically so I can do something like: ```hcl variable PLATFORMS { default = "linux/amd64" } target foo { platforms = split(",", "${PLATFORMS}") # other stuff } ``` Where the existing "csvdecode" does not work for this because it parses the string into a list of objects instead of a list of strings. I went ahead and just added all the available new functions. Signed-off-by: Brian Goff <cpuguy83@gmail.com>pull/277/head
parent
bda4882a65
commit
1ad87c6ba6
@ -1,95 +0,0 @@
|
|||||||
Copyright (c) 2017 Martin Atkins
|
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
of this software and associated documentation files (the "Software"), to deal
|
|
||||||
in the Software without restriction, including without limitation the rights
|
|
||||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
copies of the Software, and to permit persons to whom the Software is
|
|
||||||
furnished to do so, subject to the following conditions:
|
|
||||||
|
|
||||||
The above copyright notice and this permission notice shall be included in all
|
|
||||||
copies or substantial portions of the Software.
|
|
||||||
|
|
||||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
SOFTWARE.
|
|
||||||
|
|
||||||
---------
|
|
||||||
|
|
||||||
Unicode table generation programs are under a separate copyright and license:
|
|
||||||
|
|
||||||
Copyright (c) 2014 Couchbase, Inc.
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
|
||||||
except in compliance with the License. You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software distributed under the
|
|
||||||
License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
|
||||||
either express or implied. See the License for the specific language governing permissions
|
|
||||||
and limitations under the License.
|
|
||||||
|
|
||||||
---------
|
|
||||||
|
|
||||||
Grapheme break data is provided as part of the Unicode character database,
|
|
||||||
copyright 2016 Unicode, Inc, which is provided with the following license:
|
|
||||||
|
|
||||||
Unicode Data Files include all data files under the directories
|
|
||||||
http://www.unicode.org/Public/, http://www.unicode.org/reports/,
|
|
||||||
http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and
|
|
||||||
http://www.unicode.org/utility/trac/browser/.
|
|
||||||
|
|
||||||
Unicode Data Files do not include PDF online code charts under the
|
|
||||||
directory http://www.unicode.org/Public/.
|
|
||||||
|
|
||||||
Software includes any source code published in the Unicode Standard
|
|
||||||
or under the directories
|
|
||||||
http://www.unicode.org/Public/, http://www.unicode.org/reports/,
|
|
||||||
http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and
|
|
||||||
http://www.unicode.org/utility/trac/browser/.
|
|
||||||
|
|
||||||
NOTICE TO USER: Carefully read the following legal agreement.
|
|
||||||
BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
|
|
||||||
DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
|
|
||||||
YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
|
|
||||||
TERMS AND CONDITIONS OF THIS AGREEMENT.
|
|
||||||
IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
|
|
||||||
THE DATA FILES OR SOFTWARE.
|
|
||||||
|
|
||||||
COPYRIGHT AND PERMISSION NOTICE
|
|
||||||
|
|
||||||
Copyright © 1991-2017 Unicode, Inc. All rights reserved.
|
|
||||||
Distributed under the Terms of Use in http://www.unicode.org/copyright.html.
|
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining
|
|
||||||
a copy of the Unicode data files and any associated documentation
|
|
||||||
(the "Data Files") or Unicode software and any associated documentation
|
|
||||||
(the "Software") to deal in the Data Files or Software
|
|
||||||
without restriction, including without limitation the rights to use,
|
|
||||||
copy, modify, merge, publish, distribute, and/or sell copies of
|
|
||||||
the Data Files or Software, and to permit persons to whom the Data Files
|
|
||||||
or Software are furnished to do so, provided that either
|
|
||||||
(a) this copyright and permission notice appear with all copies
|
|
||||||
of the Data Files or Software, or
|
|
||||||
(b) this copyright and permission notice appear in associated
|
|
||||||
Documentation.
|
|
||||||
|
|
||||||
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
|
|
||||||
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
|
||||||
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
NONINFRINGEMENT OF THIRD PARTY RIGHTS.
|
|
||||||
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
|
|
||||||
NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
|
|
||||||
DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
|
|
||||||
DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
|
|
||||||
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
|
||||||
PERFORMANCE OF THE DATA FILES OR SOFTWARE.
|
|
||||||
|
|
||||||
Except as contained in this notice, the name of a copyright holder
|
|
||||||
shall not be used in advertising or otherwise to promote the sale,
|
|
||||||
use or other dealings in these Data Files or Software without prior
|
|
||||||
written authorization of the copyright holder.
|
|
@ -1,30 +0,0 @@
|
|||||||
package textseg
|
|
||||||
|
|
||||||
import (
|
|
||||||
"bufio"
|
|
||||||
"bytes"
|
|
||||||
)
|
|
||||||
|
|
||||||
// AllTokens is a utility that uses a bufio.SplitFunc to produce a slice of
// all of the recognized tokens in the given buffer.
//
// Each returned token is an independent copy, so the result stays valid no
// matter how large buf is. The second result is the error, if any, that
// stopped the underlying bufio.Scanner; tokens recognized before the error
// are still returned.
func AllTokens(buf []byte, splitFunc bufio.SplitFunc) ([][]byte, error) {
	scanner := bufio.NewScanner(bytes.NewReader(buf))
	scanner.Split(splitFunc)
	var ret [][]byte
	for scanner.Scan() {
		// scanner.Bytes() aliases the scanner's internal buffer, which is
		// overwritten by later Scan calls once the input outgrows it, so
		// the token has to be copied before it is retained.
		tok := scanner.Bytes()
		ret = append(ret, append(make([]byte, 0, len(tok)), tok...))
	}
	return ret, scanner.Err()
}
|
|
||||||
|
|
||||||
// TokenCount reports how many tokens the given bufio.SplitFunc recognizes in
// buf, together with the error (if any) that stopped the scan.
func TokenCount(buf []byte, splitFunc bufio.SplitFunc) (int, error) {
	sc := bufio.NewScanner(bytes.NewReader(buf))
	sc.Split(splitFunc)

	count := 0
	for sc.Scan() {
		count++
	}
	return count, sc.Err()
}
|
|
@ -1,7 +0,0 @@
|
|||||||
package textseg
|
|
||||||
|
|
||||||
//go:generate go run make_tables.go -output tables.go
|
|
||||||
//go:generate go run make_test_tables.go -output tables_test.go
|
|
||||||
//go:generate ruby unicode2ragel.rb --url=http://www.unicode.org/Public/9.0.0/ucd/auxiliary/GraphemeBreakProperty.txt -m GraphemeCluster -p "Prepend,CR,LF,Control,Extend,Regional_Indicator,SpacingMark,L,V,T,LV,LVT,E_Base,E_Modifier,ZWJ,Glue_After_Zwj,E_Base_GAZ" -o grapheme_clusters_table.rl
|
|
||||||
//go:generate ragel -Z grapheme_clusters.rl
|
|
||||||
//go:generate gofmt -w grapheme_clusters.go
|
|
File diff suppressed because it is too large
Load Diff
@ -1,132 +0,0 @@
|
|||||||
package textseg
|
|
||||||
|
|
||||||
import (
|
|
||||||
"errors"
|
|
||||||
"unicode/utf8"
|
|
||||||
)
|
|
||||||
|
|
||||||
// Generated from grapheme_clusters.rl. DO NOT EDIT
|
|
||||||
%%{
|
|
||||||
# (except you are actually in grapheme_clusters.rl here, so edit away!)
|
|
||||||
|
|
||||||
machine graphclust;
|
|
||||||
write data;
|
|
||||||
}%%
|
|
||||||
|
|
||||||
var Error = errors.New("invalid UTF8 text")
|
|
||||||
|
|
||||||
// ScanGraphemeClusters is a split function for bufio.Scanner that splits
|
|
||||||
// on grapheme cluster boundaries.
//
// It returns (0, nil, nil) when data is empty, and also when no complete
// cluster was recognized and atEOF is false, which asks the caller for more
// data. When atEOF is true and no cluster was recognized, it falls back to
// returning the first UTF-8 sequence as decoded by utf8.DecodeRune.
|
|
||||||
func ScanGraphemeClusters(data []byte, atEOF bool) (int, []byte, error) {
|
|
||||||
if len(data) == 0 {
|
|
||||||
return 0, nil, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Ragel state
|
|
||||||
cs := 0 // Current State
|
|
||||||
p := 0 // "Pointer" into data
|
|
||||||
pe := len(data) // End-of-data "pointer"
|
|
||||||
ts := 0
|
|
||||||
te := 0
|
|
||||||
act := 0
|
|
||||||
eof := pe
|
|
||||||
|
|
||||||
// Make Go compiler happy
|
|
||||||
_ = ts
|
|
||||||
_ = te
|
|
||||||
_ = act
|
|
||||||
_ = eof
|
|
||||||
|
|
||||||
startPos := 0
|
|
||||||
endPos := 0
|
|
||||||
|
|
||||||
%%{
|
|
||||||
include GraphemeCluster "grapheme_clusters_table.rl";
|
|
||||||
|
|
||||||
action start {
|
|
||||||
startPos = p
|
|
||||||
}
|
|
||||||
|
|
||||||
action end {
|
|
||||||
endPos = p
|
|
||||||
}
|
|
||||||
|
|
||||||
# endPos holds the value of p recorded by the most recent "end" action, so
# the token is data[startPos..endPos] inclusive and endPos+1 bytes are
# reported as consumed.
action emit {
|
|
||||||
return endPos+1, data[startPos:endPos+1], nil
|
|
||||||
}
|
|
||||||
|
|
||||||
ZWJGlue = ZWJ (Glue_After_Zwj | E_Base_GAZ Extend* E_Modifier?)?;
|
|
||||||
AnyExtender = Extend | ZWJGlue | SpacingMark;
|
|
||||||
Extension = AnyExtender*;
|
|
||||||
ReplacementChar = (0xEF 0xBF 0xBD);
|
|
||||||
|
|
||||||
CRLFSeq = CR LF;
|
|
||||||
ControlSeq = Control | ReplacementChar;
|
|
||||||
HangulSeq = (
|
|
||||||
L+ (((LV? V+ | LVT) T*)?|LV?) |
|
|
||||||
LV V* T* |
|
|
||||||
V+ T* |
|
|
||||||
LVT T* |
|
|
||||||
T+
|
|
||||||
) Extension;
|
|
||||||
EmojiSeq = (E_Base | E_Base_GAZ) Extend* E_Modifier? Extension;
|
|
||||||
ZWJSeq = ZWJGlue Extension;
|
|
||||||
EmojiFlagSeq = Regional_Indicator Regional_Indicator? Extension;
|
|
||||||
|
|
||||||
UTF8Cont = 0x80 .. 0xBF;
|
|
||||||
AnyUTF8 = (
|
|
||||||
0x00..0x7F |
|
|
||||||
0xC0..0xDF . UTF8Cont |
|
|
||||||
0xE0..0xEF . UTF8Cont . UTF8Cont |
|
|
||||||
0xF0..0xF7 . UTF8Cont . UTF8Cont . UTF8Cont
|
|
||||||
);
|
|
||||||
|
|
||||||
# OtherSeq is any character that isn't at the start of one of the extended sequences above, followed by extension
|
|
||||||
OtherSeq = (AnyUTF8 - (CR|LF|Control|ReplacementChar|L|LV|V|LVT|T|E_Base|E_Base_GAZ|ZWJ|Regional_Indicator|Prepend)) Extension;
|
|
||||||
|
|
||||||
# PrependSeq is prepend followed by any of the other patterns above, except control characters which explicitly break
|
|
||||||
PrependSeq = Prepend+ (HangulSeq|EmojiSeq|ZWJSeq|EmojiFlagSeq|OtherSeq)?;
|
|
||||||
|
|
||||||
CRLFTok = CRLFSeq >start @end;
|
|
||||||
ControlTok = ControlSeq >start @end;
|
|
||||||
HangulTok = HangulSeq >start @end;
|
|
||||||
EmojiTok = EmojiSeq >start @end;
|
|
||||||
ZWJTok = ZWJSeq >start @end;
|
|
||||||
EmojiFlagTok = EmojiFlagSeq >start @end;
|
|
||||||
OtherTok = OtherSeq >start @end;
|
|
||||||
PrependTok = PrependSeq >start @end;
|
|
||||||
|
|
||||||
main := |*
|
|
||||||
CRLFTok => emit;
|
|
||||||
ControlTok => emit;
|
|
||||||
HangulTok => emit;
|
|
||||||
EmojiTok => emit;
|
|
||||||
ZWJTok => emit;
|
|
||||||
EmojiFlagTok => emit;
|
|
||||||
PrependTok => emit;
|
|
||||||
OtherTok => emit;
|
|
||||||
|
|
||||||
# any single valid UTF-8 character would also be valid per spec,
|
|
||||||
# but we'll handle that separately after the loop so we can deal
|
|
||||||
# with requesting more bytes if we're not at EOF.
|
|
||||||
*|;
|
|
||||||
|
|
||||||
write init;
|
|
||||||
write exec;
|
|
||||||
}%%
|
|
||||||
|
|
||||||
// If we fall out here then we were unable to complete a sequence.
|
|
||||||
// If we weren't able to complete a sequence then either we've
|
|
||||||
// reached the end of a partial buffer (so there's more data to come)
|
|
||||||
// or we have an isolated symbol that would normally be part of a
|
|
||||||
// grapheme cluster but has appeared in isolation here.
|
|
||||||
|
|
||||||
if !atEOF {
|
|
||||||
// Request more
|
|
||||||
return 0, nil, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Just take the first UTF-8 sequence and return that.
|
|
||||||
_, seqLen := utf8.DecodeRune(data)
|
|
||||||
return seqLen, data[:seqLen], nil
|
|
||||||
}
|
|
1583
vendor/github.com/apparentlymart/go-textseg/textseg/grapheme_clusters_table.rl
generated
vendored
1583
vendor/github.com/apparentlymart/go-textseg/textseg/grapheme_clusters_table.rl
generated
vendored
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,335 +0,0 @@
|
|||||||
#!/usr/bin/env ruby
|
|
||||||
#
|
|
||||||
# This script has been updated to accept more command-line arguments:
|
|
||||||
#
|
|
||||||
# -u, --url URL to process
|
|
||||||
# -m, --machine Machine name
|
|
||||||
# -p, --properties Properties to add to the machine
|
|
||||||
# -o, --output Write output to file
|
|
||||||
#
|
|
||||||
# Updated by: Marty Schoch <marty.schoch@gmail.com>
|
|
||||||
#
|
|
||||||
# This script uses the unicode spec to generate a Ragel state machine
|
|
||||||
# that recognizes unicode alphanumeric characters. It generates 5
|
|
||||||
# character classes: uupper, ulower, ualpha, udigit, and ualnum.
|
|
||||||
# Currently supported encodings are UTF-8 [default] and UCS-4.
|
|
||||||
#
|
|
||||||
# Usage: unicode2ragel.rb [options]
|
|
||||||
# -e, --encoding [ucs4 | utf8] Data encoding
|
|
||||||
# -h, --help Show this message
|
|
||||||
#
|
|
||||||
# This script was originally written as part of the Ferret search
|
|
||||||
# engine library.
|
|
||||||
#
|
|
||||||
# Author: Rakan El-Khalil <rakan@well.com>
|
|
||||||
|
|
||||||
require 'optparse'
|
|
||||||
require 'open-uri'
|
|
||||||
|
|
||||||
# Encodings the generator can emit, and the Ragel alphtype matching each one.
ENCODINGS = [ :utf8, :ucs4 ]
ALPHTYPES = { :utf8 => "byte", :ucs4 => "rune" }
DEFAULT_CHART_URL = "http://www.unicode.org/Public/5.1.0/ucd/DerivedCoreProperties.txt"
DEFAULT_MACHINE_NAME= "WChar"

###
# Display vars & default option

TOTAL_WIDTH = 80
RANGE_WIDTH = 23
@encoding = :utf8
@chart_url = DEFAULT_CHART_URL
machine_name = DEFAULT_MACHINE_NAME
properties = []
@output = $stdout

###
# Option parsing

cli_opts = OptionParser.new do |opts|
  opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o|
    # The argument is declared optional, so a bare "-e" yields nil here.
    # to_s turns that into an empty (invalid) encoding name, which is then
    # rejected below instead of raising NoMethodError.
    @encoding = o.to_s.downcase.to_sym
  end
  opts.on("-h", "--help", "Show this message") do
    puts opts
    exit
  end
  opts.on("-u", "--url URL", "URL to process") do |o|
    @chart_url = o
  end
  opts.on("-m", "--machine MACHINE_NAME", "Machine name") do |o|
    machine_name = o
  end
  opts.on("-p", "--properties x,y,z", Array, "Properties to add to machine") do |o|
    properties = o
  end
  opts.on("-o", "--output FILE", "output file") do |o|
    @output = File.new(o, "w+")
  end
end

cli_opts.parse(ARGV)
unless ENCODINGS.member? @encoding
  puts "Invalid encoding: #{@encoding}"
  puts cli_opts
  # Exit non-zero so callers such as `go generate` notice the bad invocation.
  exit 1
end
|
|
||||||
|
|
||||||
##
# Opens the document at url and, for every non-comment line that assigns
# the given property, yields the line's code point range (as an Integer
# Range) and its trailing description text.
#
# url may be anything open-uri can open: a remote URL or a local path.

def each_alpha( url, property )
  # Kernel#open no longer accepts URLs as of Ruby 3.0; URI.open is the
  # open-uri entry point. Interpreters that predate a public URI.open fall
  # back to Kernel.open, which open-uri patches there.
  opener = URI.respond_to?(:open) ? URI : Kernel
  opener.open( url ) do |file|
    file.each_line do |line|
      next if line =~ /^#/;
      next if line !~ /; #{property} #/;

      range, description = line.split(/;/)
      range.strip!
      # Keep only the text after the last '#' on the line.
      description.gsub!(/.*#/, '').strip!

      if range =~ /\.\./
        start, stop = range.split '..'
      else
        # A single code point is treated as a one-element range.
        start = stop = range
      end

      yield(start.hex .. stop.hex, description)
    end
  end
end
|
|
||||||
|
|
||||||
###
# Renders n as uppercase hex, prefixing a single "0" when needed so the
# result always has an even number of digits.

def to_hex( n )
  digits = format("%0X", n)
  digits.length.odd? ? "0#{digits}" : digits
end
|
|
||||||
|
|
||||||
###
# UCS-4 needs no byte-level encoding: the range is rendered directly from
# the code point values, as "0xAA" for a single code point or "0xAA..0xBB"
# otherwise. The result is a one-element array.

def to_ucs4( range )
  lo, hi = range.begin, range.end
  text = "0x" + to_hex(lo)
  text += "..0x" + to_hex(hi) unless lo == hi
  [ text ]
end
|
|
||||||
|
|
||||||
##
|
|
||||||
# 0x00 - 0x7f -> 0zzzzzzz[7]
|
|
||||||
# 0x80 - 0x7ff -> 110yyyyy[5] 10zzzzzz[6]
|
|
||||||
# 0x800 - 0xffff -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6]
|
|
||||||
# 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6]
|
|
||||||
|
|
||||||
UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff]
|
|
||||||
|
|
||||||
# Encodes code point n as UTF-8 and returns the encoded bytes as a hex
# string produced by to_hex. Values above 0x10ffff are rendered as to_hex(0).
def to_utf8_enc( n )
  # Builds one continuation byte (10xxxxxx) from the 6 bits at the given shift.
  cont = lambda { |shift| 0x80 | ((n >> shift) & 0x3f) }

  packed =
    if n <= 0x7f
      n
    elsif n <= 0x7ff
      (0xc0 | (n >> 6)) << 8 | cont.call(0)
    elsif n <= 0xffff
      (0xe0 | (n >> 12)) << 16 | cont.call(6) << 8 | cont.call(0)
    elsif n <= 0x10ffff
      (0xf0 | (n >> 18)) << 24 | cont.call(12) << 16 | cont.call(6) << 8 | cont.call(0)
    else
      0
    end

  to_hex(packed)
end
|
|
||||||
|
|
||||||
# Decodes a UTF-8 byte sequence given as a hex string (for example
# "E282AC") and returns the code point as an Integer. The branch is chosen
# from the numeric value of the packed bytes; values above 0xf7ffffff
# decode to 0.
def from_utf8_enc( n )
  n = n.hex
  r = 0
  if n <= 0x7f
    r = n
  elsif n <= 0xdfff
    y = (n >> 8) & 0x1f
    z = n & 0x3f
    r = y << 6 | z
  elsif n <= 0xefffff
    x = (n >> 16) & 0x0f
    y = (n >> 8) & 0x3f
    z = n & 0x3f
    # The lead byte's 4 payload bits sit above two 6-bit groups, so they
    # shift by 12 (mirroring the n >> 12 used when encoding).
    r = x << 12 | y << 6 | z
  elsif n <= 0xf7ffffff
    w = (n >> 24) & 0x07
    x = (n >> 16) & 0x3f
    y = (n >> 8) & 0x3f
    z = n & 0x3f
    r = w << 18 | x << 12 | y << 6 | z
  end
  r
end
|
|
||||||
|
|
||||||
###
# Splits a code point range at the UTF-8 length boundaries listed in
# UTF8_BOUNDARIES, so every returned sub-range encodes to a single byte
# length. Eg: 0x00 .. 0xff => [0x00..0x7f, 0x80..0xff]
# The current [5.1] unicode standard has no ranges straddling those
# boundaries, so this is not strictly needed; it is kept for completeness
# in case that ever changes.

def utf8_ranges( range )
  pieces = []
  UTF8_BOUNDARIES.each do |limit|
    next if range.begin > limit

    if range.end <= limit
      pieces << range
      return pieces
    end

    # The range crosses this boundary: keep the part below it and carry
    # the remainder on to the next boundary.
    pieces << (range.begin .. limit)
    range = (limit + 1) .. range.end
  end
  pieces
end
|
|
||||||
|
|
||||||
# Expands a pair of equal-length hex byte strings (start, stop) into a list
# of Ragel byte-sequence expressions, working one leading byte at a time.
def build_range( start, stop )
  nbytes = start.size / 2
  return [""] if nbytes < 1
  rest = nbytes - 1

  head_lo = start[0..1]
  head_hi = stop[0..1]

  ###
  # Shared prefix: emit the common byte and recurse on what follows.

  if head_lo == head_hi
    tails = build_range(start[2..-1], stop[2..-1])
    return tails.map { |tail| "0x#{head_lo} " + tail }
  end

  ###
  # Unshared prefix, end of run

  return ["0x#{head_lo}..0x#{head_hi} "] if rest.zero?

  ###
  # Unshared prefix, not end of run
  # Range can be 0x123456..0x56789A
  # Which is equivalent to:
  # 0x123456 .. 0x12FFFF
  # 0x130000 .. 0x55FFFF
  # 0x560000 .. 0x56789A

  pieces = build_range(start, head_lo + "FF" * rest)

  ###
  # Only generate middle range if need be.

  unless head_lo.hex + 1 == head_hi.hex
    top = head_hi == "FF" ? "FF" : to_hex(head_hi.hex - 1)
    pieces << "0x#{to_hex(head_lo.hex + 1)}..0x#{top} " + "0x00..0xFF " * rest
  end

  ###
  # Don't generate last range if it is covered by first range

  pieces.concat(build_range(head_hi + "00" * rest, stop)) unless head_hi == "FF"
  pieces
end
|
|
||||||
|
|
||||||
# Converts a code point range into a flat list of Ragel expressions over
# UTF-8 bytes: the range is split at UTF-8 length boundaries, each piece's
# endpoints are encoded, and build_range expands every encoded pair.
def to_utf8( range )
  # flat_map always returns an Array. The previous map { ... }.flatten!
  # returned nil when there was nothing to flatten, i.e. when utf8_ranges
  # produced no pieces.
  utf8_ranges( range ).flat_map do |r|
    build_range(to_utf8_enc(r.begin), to_utf8_enc(r.end))
  end
end
|
|
||||||
|
|
||||||
##
|
|
||||||
# Perform a 3-way comparison of the number of codepoints advertised by
|
|
||||||
# the unicode spec for the given range, the originally parsed range,
|
|
||||||
# and the resulting utf8 encoded range.
|
|
||||||
|
|
||||||
# Returns the product of the widths of every "0xAA..0xBB" element in the
# space-separated expression code; elements that are not ranges do not
# change the product. Under :utf8 the endpoints are decoded with
# from_utf8_enc, otherwise they are read as plain hex.
def count_codepoints( code )
  code.split(' ').reduce(1) do |product, piece|
    bounds = /0x(.+)\.\.0x(.+)/.match(piece)
    next product unless bounds

    lo, hi = bounds[1], bounds[2]
    width =
      if @encoding == :utf8
        from_utf8_enc(hi) - from_utf8_enc(lo) + 1
      else
        hi.hex - lo.hex + 1
      end
    product * width
  end
end
|
|
||||||
|
|
||||||
# True when three counts agree: the "[N]" count in desc (1 when absent),
# the size of the parsed range, and the total that count_codepoints
# reports across the generated codes.
def is_valid?( range, desc, codes )
  advertised = (desc =~ /\[(\d+)\]/) ? $1.to_i : 1
  parsed = range.end - range.begin + 1
  encoded = codes.inject(0) { |total, code| total + count_codepoints(code) }

  encoded == advertised && encoded == parsed
end
|
|
||||||
|
|
||||||
##
# Writes the state machine definition for one property to @output

def generate_machine( name, property )
  sep = " "
  @output.puts " #{name} = "

  each_alpha( @chart_url, property ) do |range, desc|
    codes = @encoding == :ucs4 ? to_ucs4(range) : to_utf8(range)

    #raise "Invalid encoding of range #{range}: #{codes.inspect}" unless
    # is_valid? range, desc, codes

    # The code column is at least RANGE_WIDTH wide; whatever it takes beyond
    # that comes out of the space left for the description.
    widest = codes.map { |c| c.size }.max
    range_width = widest < RANGE_WIDTH ? RANGE_WIDTH : widest
    desc_width = TOTAL_WIDTH - range_width - 11

    desc = desc[0..desc_width - 4] + "..." if desc.size > desc_width

    codes.each_with_index do |code, idx|
      # Only the first alternative of a range carries the description.
      label = idx.zero? ? desc : ""
      padded = "%-#{range_width}s" % code
      @output.puts " #{sep} #{padded} ##{label}"
      sep = "|"
    end
  end

  @output.puts " ;"
  @output.puts ""
end
|
|
||||||
|
|
||||||
# Emit the generated file: a header comment plus the machine declaration,
# one definition per requested property, then the closing delimiter.
header = <<EOF
# The following Ragel file was autogenerated with #{$0}
# from: #{@chart_url}
#
# It defines #{properties}.
#
# To use this, make sure that your alphtype is set to #{ALPHTYPES[@encoding]},
# and that your input is in #{@encoding}.

%%{
machine #{machine_name};

EOF
@output.puts header

properties.each do |prop|
  generate_machine( prop, prop )
end

@output.puts "}%%"
|
|
@ -1,19 +0,0 @@
|
|||||||
package textseg
|
|
||||||
|
|
||||||
import "unicode/utf8"
|
|
||||||
|
|
||||||
// ScanUTF8Sequences is a split function for bufio.Scanner that splits
// on UTF8 sequence boundaries.
//
// This is included largely for completeness, since this behavior is already
// built in to Go when ranging over a string.
//
// More data is requested only when data ends partway through a sequence and
// atEOF is false. A byte that cannot start a valid sequence, or a trailing
// partial sequence at EOF, is returned as a single-byte token.
func ScanUTF8Sequences(data []byte, atEOF bool) (int, []byte, error) {
	if len(data) == 0 {
		return 0, nil, nil
	}
	// utf8.FullRune is false only for a truncated sequence. Testing the
	// decoded rune against utf8.RuneError instead would also match a
	// correctly encoded U+FFFD and plain invalid bytes, stalling the scanner
	// until EOF.
	if !atEOF && !utf8.FullRune(data) {
		return 0, nil, nil
	}
	_, seqLen := utf8.DecodeRune(data)
	return seqLen, data[:seqLen], nil
}
|
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,87 @@
|
|||||||
|
package stdlib
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strconv"
|
||||||
|
|
||||||
|
"github.com/zclconf/go-cty/cty"
|
||||||
|
"github.com/zclconf/go-cty/cty/convert"
|
||||||
|
"github.com/zclconf/go-cty/cty/function"
|
||||||
|
)
|
||||||
|
|
||||||
|
// MakeToFunc constructs a "to..." function, like "tostring", which converts
|
||||||
|
// its argument to a specific type or type kind.
|
||||||
|
//
|
||||||
|
// The given type wantTy can be any type constraint that cty's "convert" package
|
||||||
|
// would accept. In particular, this means that you can pass
|
||||||
|
// cty.List(cty.DynamicPseudoType) to mean "list of any single type", which
|
||||||
|
// will then cause cty to attempt to unify all of the element types when given
|
||||||
|
// a tuple.
|
||||||
|
func MakeToFunc(wantTy cty.Type) function.Function {
|
||||||
|
return function.New(&function.Spec{
|
||||||
|
Params: []function.Parameter{
|
||||||
|
{
|
||||||
|
Name: "v",
|
||||||
|
// We use DynamicPseudoType rather than wantTy here so that
|
||||||
|
// all values will pass through the function API verbatim and
|
||||||
|
// we can handle the conversion logic within the Type and
|
||||||
|
// Impl functions. This allows us to customize the error
|
||||||
|
// messages to be more appropriate for an explicit type
|
||||||
|
// conversion, whereas the cty function system produces
|
||||||
|
// messages aimed at _implicit_ type conversions.
|
||||||
|
Type: cty.DynamicPseudoType,
|
||||||
|
AllowNull: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
Type: func(args []cty.Value) (cty.Type, error) {
|
||||||
|
gotTy := args[0].Type()
|
||||||
|
if gotTy.Equals(wantTy) {
|
||||||
|
return wantTy, nil
|
||||||
|
}
|
||||||
|
conv := convert.GetConversionUnsafe(args[0].Type(), wantTy)
|
||||||
|
if conv == nil {
|
||||||
|
// We'll use some specialized errors for some trickier cases,
|
||||||
|
// but most we can handle in a simple way.
|
||||||
|
switch {
|
||||||
|
case gotTy.IsTupleType() && wantTy.IsTupleType():
|
||||||
|
return cty.NilType, function.NewArgErrorf(0, "incompatible tuple type for conversion: %s", convert.MismatchMessage(gotTy, wantTy))
|
||||||
|
case gotTy.IsObjectType() && wantTy.IsObjectType():
|
||||||
|
return cty.NilType, function.NewArgErrorf(0, "incompatible object type for conversion: %s", convert.MismatchMessage(gotTy, wantTy))
|
||||||
|
default:
|
||||||
|
return cty.NilType, function.NewArgErrorf(0, "cannot convert %s to %s", gotTy.FriendlyName(), wantTy.FriendlyNameForConstraint())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// If a conversion is available then everything is fine.
|
||||||
|
return wantTy, nil
|
||||||
|
},
|
||||||
|
Impl: func(args []cty.Value, retType cty.Type) (cty.Value, error) {
|
||||||
|
// We didn't set "AllowUnknown" on our argument, so it is guaranteed
|
||||||
|
// to be known here but may still be null.
|
||||||
|
ret, err := convert.Convert(args[0], retType)
|
||||||
|
if err != nil {
|
||||||
|
// Because we used GetConversionUnsafe above, conversion can
|
||||||
|
// still potentially fail in here. For example, if the user
|
||||||
|
// asks to convert the string "a" to bool then we'll
|
||||||
|
// optimistically permit it during type checking but fail here
|
||||||
|
// once we note that the value isn't either "true" or "false".
|
||||||
|
gotTy := args[0].Type()
|
||||||
|
switch {
|
||||||
|
case gotTy == cty.String && wantTy == cty.Bool:
|
||||||
|
what := "string"
|
||||||
|
if !args[0].IsNull() {
|
||||||
|
what = strconv.Quote(args[0].AsString())
|
||||||
|
}
|
||||||
|
return cty.NilVal, function.NewArgErrorf(0, `cannot convert %s to bool; only the strings "true" or "false" are allowed`, what)
|
||||||
|
case gotTy == cty.String && wantTy == cty.Number:
|
||||||
|
what := "string"
|
||||||
|
if !args[0].IsNull() {
|
||||||
|
what = strconv.Quote(args[0].AsString())
|
||||||
|
}
|
||||||
|
return cty.NilVal, function.NewArgErrorf(0, `cannot convert %s to number; given string must be a decimal representation of a number`, what)
|
||||||
|
default:
|
||||||
|
return cty.NilVal, function.NewArgErrorf(0, "cannot convert %s to %s", gotTy.FriendlyName(), wantTy.FriendlyNameForConstraint())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ret, nil
|
||||||
|
},
|
||||||
|
})
|
||||||
|
}
|
@ -0,0 +1,80 @@
|
|||||||
|
package stdlib
|
||||||
|
|
||||||
|
import (
|
||||||
|
"regexp"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/zclconf/go-cty/cty"
|
||||||
|
"github.com/zclconf/go-cty/cty/function"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ReplaceFunc is a function that searches a given string for another given
|
||||||
|
// substring, and replaces each occurence with a given replacement string.
|
||||||
|
// The substr argument is a simple string.
|
||||||
|
var ReplaceFunc = function.New(&function.Spec{
|
||||||
|
Params: []function.Parameter{
|
||||||
|
{
|
||||||
|
Name: "str",
|
||||||
|
Type: cty.String,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "substr",
|
||||||
|
Type: cty.String,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "replace",
|
||||||
|
Type: cty.String,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
Type: function.StaticReturnType(cty.String),
|
||||||
|
Impl: func(args []cty.Value, retType cty.Type) (cty.Value, error) {
|
||||||
|
str := args[0].AsString()
|
||||||
|
substr := args[1].AsString()
|
||||||
|
replace := args[2].AsString()
|
||||||
|
|
||||||
|
return cty.StringVal(strings.Replace(str, substr, replace, -1)), nil
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
// RegexReplaceFunc is a function that searches a given string for another
|
||||||
|
// given substring, and replaces each occurence with a given replacement
|
||||||
|
// string. The substr argument must be a valid regular expression.
|
||||||
|
var RegexReplaceFunc = function.New(&function.Spec{
|
||||||
|
Params: []function.Parameter{
|
||||||
|
{
|
||||||
|
Name: "str",
|
||||||
|
Type: cty.String,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "substr",
|
||||||
|
Type: cty.String,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: "replace",
|
||||||
|
Type: cty.String,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
Type: function.StaticReturnType(cty.String),
|
||||||
|
Impl: func(args []cty.Value, retType cty.Type) (ret cty.Value, err error) {
|
||||||
|
str := args[0].AsString()
|
||||||
|
substr := args[1].AsString()
|
||||||
|
replace := args[2].AsString()
|
||||||
|
|
||||||
|
re, err := regexp.Compile(substr)
|
||||||
|
if err != nil {
|
||||||
|
return cty.UnknownVal(cty.String), err
|
||||||
|
}
|
||||||
|
|
||||||
|
return cty.StringVal(re.ReplaceAllString(str, replace)), nil
|
||||||
|
},
|
||||||
|
})
|
||||||
|
|
||||||
|
// Replace searches a given string for another given substring,
|
||||||
|
// and replaces all occurrences with a given replacement string.
|
||||||
|
func Replace(str, substr, replace cty.Value) (cty.Value, error) {
|
||||||
|
return ReplaceFunc.Call([]cty.Value{str, substr, replace})
|
||||||
|
}
|
||||||
|
|
||||||
|
func RegexReplace(str, substr, replace cty.Value) (cty.Value, error) {
|
||||||
|
return RegexReplaceFunc.Call([]cty.Value{str, substr, replace})
|
||||||
|
}
|
Loading…
Reference in New Issue