Merge pull request #920 from crazy-max/update-gocty
bump github.com/zclconf/go-cty from 1.7.1 to 1.10.0pull/925/head
commit
b568b219bc
@ -0,0 +1,95 @@
|
||||
Copyright (c) 2017 Martin Atkins
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
|
||||
---------
|
||||
|
||||
Unicode table generation programs are under a separate copyright and license:
|
||||
|
||||
Copyright (c) 2014 Couchbase, Inc.
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
|
||||
except in compliance with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the
|
||||
License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
either express or implied. See the License for the specific language governing permissions
|
||||
and limitations under the License.
|
||||
|
||||
---------
|
||||
|
||||
Grapheme break data is provided as part of the Unicode character database,
|
||||
copright 2016 Unicode, Inc, which is provided with the following license:
|
||||
|
||||
Unicode Data Files include all data files under the directories
|
||||
http://www.unicode.org/Public/, http://www.unicode.org/reports/,
|
||||
http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and
|
||||
http://www.unicode.org/utility/trac/browser/.
|
||||
|
||||
Unicode Data Files do not include PDF online code charts under the
|
||||
directory http://www.unicode.org/Public/.
|
||||
|
||||
Software includes any source code published in the Unicode Standard
|
||||
or under the directories
|
||||
http://www.unicode.org/Public/, http://www.unicode.org/reports/,
|
||||
http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and
|
||||
http://www.unicode.org/utility/trac/browser/.
|
||||
|
||||
NOTICE TO USER: Carefully read the following legal agreement.
|
||||
BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
|
||||
DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
|
||||
YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
|
||||
TERMS AND CONDITIONS OF THIS AGREEMENT.
|
||||
IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
|
||||
THE DATA FILES OR SOFTWARE.
|
||||
|
||||
COPYRIGHT AND PERMISSION NOTICE
|
||||
|
||||
Copyright © 1991-2017 Unicode, Inc. All rights reserved.
|
||||
Distributed under the Terms of Use in http://www.unicode.org/copyright.html.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of the Unicode data files and any associated documentation
|
||||
(the "Data Files") or Unicode software and any associated documentation
|
||||
(the "Software") to deal in the Data Files or Software
|
||||
without restriction, including without limitation the rights to use,
|
||||
copy, modify, merge, publish, distribute, and/or sell copies of
|
||||
the Data Files or Software, and to permit persons to whom the Data Files
|
||||
or Software are furnished to do so, provided that either
|
||||
(a) this copyright and permission notice appear with all copies
|
||||
of the Data Files or Software, or
|
||||
(b) this copyright and permission notice appear in associated
|
||||
Documentation.
|
||||
|
||||
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||||
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
||||
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT OF THIRD PARTY RIGHTS.
|
||||
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
|
||||
NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
|
||||
DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
|
||||
DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
|
||||
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
||||
PERFORMANCE OF THE DATA FILES OR SOFTWARE.
|
||||
|
||||
Except as contained in this notice, the name of a copyright holder
|
||||
shall not be used in advertising or otherwise to promote the sale,
|
||||
use or other dealings in these Data Files or Software without prior
|
||||
written authorization of the copyright holder.
|
@ -0,0 +1,30 @@
|
||||
package textseg
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
)
|
||||
|
||||
// AllTokens is a utility that uses a bufio.SplitFunc to produce a slice of
|
||||
// all of the recognized tokens in the given buffer.
|
||||
func AllTokens(buf []byte, splitFunc bufio.SplitFunc) ([][]byte, error) {
|
||||
scanner := bufio.NewScanner(bytes.NewReader(buf))
|
||||
scanner.Split(splitFunc)
|
||||
var ret [][]byte
|
||||
for scanner.Scan() {
|
||||
ret = append(ret, scanner.Bytes())
|
||||
}
|
||||
return ret, scanner.Err()
|
||||
}
|
||||
|
||||
// TokenCount is a utility that uses a bufio.SplitFunc to count the number of
|
||||
// recognized tokens in the given buffer.
|
||||
func TokenCount(buf []byte, splitFunc bufio.SplitFunc) (int, error) {
|
||||
scanner := bufio.NewScanner(bytes.NewReader(buf))
|
||||
scanner.Split(splitFunc)
|
||||
var ret int
|
||||
for scanner.Scan() {
|
||||
ret++
|
||||
}
|
||||
return ret, scanner.Err()
|
||||
}
|
@ -0,0 +1,8 @@
|
||||
package textseg
|
||||
|
||||
//go:generate go run make_tables.go -output tables.go
|
||||
//go:generate go run make_test_tables.go -output tables_test.go
|
||||
//go:generate ruby unicode2ragel.rb --url=https://www.unicode.org/Public/13.0.0/ucd/auxiliary/GraphemeBreakProperty.txt -m GraphemeCluster -p "Prepend,CR,LF,Control,Extend,Regional_Indicator,SpacingMark,L,V,T,LV,LVT,ZWJ" -o grapheme_clusters_table.rl
|
||||
//go:generate ruby unicode2ragel.rb --url=https://www.unicode.org/Public/13.0.0/ucd/emoji/emoji-data.txt -m Emoji -p "Extended_Pictographic" -o emoji_table.rl
|
||||
//go:generate ragel -Z grapheme_clusters.rl
|
||||
//go:generate gofmt -w grapheme_clusters.go
|
4138
vendor/github.com/apparentlymart/go-textseg/v13/textseg/grapheme_clusters.go
generated
vendored
4138
vendor/github.com/apparentlymart/go-textseg/v13/textseg/grapheme_clusters.go
generated
vendored
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,133 @@
|
||||
package textseg
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// Generated from grapheme_clusters.rl. DO NOT EDIT
|
||||
%%{
|
||||
# (except you are actually in grapheme_clusters.rl here, so edit away!)
|
||||
|
||||
machine graphclust;
|
||||
write data;
|
||||
}%%
|
||||
|
||||
var Error = errors.New("invalid UTF8 text")
|
||||
|
||||
// ScanGraphemeClusters is a split function for bufio.Scanner that splits
|
||||
// on grapheme cluster boundaries.
|
||||
func ScanGraphemeClusters(data []byte, atEOF bool) (int, []byte, error) {
|
||||
if len(data) == 0 {
|
||||
return 0, nil, nil
|
||||
}
|
||||
|
||||
// Ragel state
|
||||
cs := 0 // Current State
|
||||
p := 0 // "Pointer" into data
|
||||
pe := len(data) // End-of-data "pointer"
|
||||
ts := 0
|
||||
te := 0
|
||||
act := 0
|
||||
eof := pe
|
||||
|
||||
// Make Go compiler happy
|
||||
_ = ts
|
||||
_ = te
|
||||
_ = act
|
||||
_ = eof
|
||||
|
||||
startPos := 0
|
||||
endPos := 0
|
||||
|
||||
%%{
|
||||
include GraphemeCluster "grapheme_clusters_table.rl";
|
||||
include Emoji "emoji_table.rl";
|
||||
|
||||
action start {
|
||||
startPos = p
|
||||
}
|
||||
|
||||
action end {
|
||||
endPos = p
|
||||
}
|
||||
|
||||
action emit {
|
||||
return endPos+1, data[startPos:endPos+1], nil
|
||||
}
|
||||
|
||||
ZWJGlue = ZWJ (Extended_Pictographic Extend*)?;
|
||||
AnyExtender = Extend | ZWJGlue | SpacingMark;
|
||||
Extension = AnyExtender*;
|
||||
ReplacementChar = (0xEF 0xBF 0xBD);
|
||||
|
||||
CRLFSeq = CR LF;
|
||||
ControlSeq = Control | ReplacementChar;
|
||||
HangulSeq = (
|
||||
L+ (((LV? V+ | LVT) T*)?|LV?) |
|
||||
LV V* T* |
|
||||
V+ T* |
|
||||
LVT T* |
|
||||
T+
|
||||
) Extension;
|
||||
EmojiSeq = Extended_Pictographic Extend* Extension;
|
||||
ZWJSeq = ZWJ (ZWJ | Extend | SpacingMark)*;
|
||||
EmojiFlagSeq = Regional_Indicator Regional_Indicator? Extension;
|
||||
|
||||
UTF8Cont = 0x80 .. 0xBF;
|
||||
AnyUTF8 = (
|
||||
0x00..0x7F |
|
||||
0xC0..0xDF . UTF8Cont |
|
||||
0xE0..0xEF . UTF8Cont . UTF8Cont |
|
||||
0xF0..0xF7 . UTF8Cont . UTF8Cont . UTF8Cont
|
||||
);
|
||||
|
||||
# OtherSeq is any character that isn't at the start of one of the extended sequences above, followed by extension
|
||||
OtherSeq = (AnyUTF8 - (CR|LF|Control|ReplacementChar|L|LV|V|LVT|T|Extended_Pictographic|ZWJ|Regional_Indicator|Prepend)) (Extend | ZWJ | SpacingMark)*;
|
||||
|
||||
# PrependSeq is prepend followed by any of the other patterns above, except control characters which explicitly break
|
||||
PrependSeq = Prepend+ (HangulSeq|EmojiSeq|ZWJSeq|EmojiFlagSeq|OtherSeq)?;
|
||||
|
||||
CRLFTok = CRLFSeq >start @end;
|
||||
ControlTok = ControlSeq >start @end;
|
||||
HangulTok = HangulSeq >start @end;
|
||||
EmojiTok = EmojiSeq >start @end;
|
||||
ZWJTok = ZWJSeq >start @end;
|
||||
EmojiFlagTok = EmojiFlagSeq >start @end;
|
||||
OtherTok = OtherSeq >start @end;
|
||||
PrependTok = PrependSeq >start @end;
|
||||
|
||||
main := |*
|
||||
CRLFTok => emit;
|
||||
ControlTok => emit;
|
||||
HangulTok => emit;
|
||||
EmojiTok => emit;
|
||||
ZWJTok => emit;
|
||||
EmojiFlagTok => emit;
|
||||
PrependTok => emit;
|
||||
OtherTok => emit;
|
||||
|
||||
# any single valid UTF-8 character would also be valid per spec,
|
||||
# but we'll handle that separately after the loop so we can deal
|
||||
# with requesting more bytes if we're not at EOF.
|
||||
*|;
|
||||
|
||||
write init;
|
||||
write exec;
|
||||
}%%
|
||||
|
||||
// If we fall out here then we were unable to complete a sequence.
|
||||
// If we weren't able to complete a sequence then either we've
|
||||
// reached the end of a partial buffer (so there's more data to come)
|
||||
// or we have an isolated symbol that would normally be part of a
|
||||
// grapheme cluster but has appeared in isolation here.
|
||||
|
||||
if !atEOF {
|
||||
// Request more
|
||||
return 0, nil, nil
|
||||
}
|
||||
|
||||
// Just take the first UTF-8 sequence and return that.
|
||||
_, seqLen := utf8.DecodeRune(data)
|
||||
return seqLen, data[:seqLen], nil
|
||||
}
|
1609
vendor/github.com/apparentlymart/go-textseg/v13/textseg/grapheme_clusters_table.rl
generated
vendored
1609
vendor/github.com/apparentlymart/go-textseg/v13/textseg/grapheme_clusters_table.rl
generated
vendored
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,335 @@
|
||||
#!/usr/bin/env ruby
|
||||
#
|
||||
# This scripted has been updated to accept more command-line arguments:
|
||||
#
|
||||
# -u, --url URL to process
|
||||
# -m, --machine Machine name
|
||||
# -p, --properties Properties to add to the machine
|
||||
# -o, --output Write output to file
|
||||
#
|
||||
# Updated by: Marty Schoch <marty.schoch@gmail.com>
|
||||
#
|
||||
# This script uses the unicode spec to generate a Ragel state machine
|
||||
# that recognizes unicode alphanumeric characters. It generates 5
|
||||
# character classes: uupper, ulower, ualpha, udigit, and ualnum.
|
||||
# Currently supported encodings are UTF-8 [default] and UCS-4.
|
||||
#
|
||||
# Usage: unicode2ragel.rb [options]
|
||||
# -e, --encoding [ucs4 | utf8] Data encoding
|
||||
# -h, --help Show this message
|
||||
#
|
||||
# This script was originally written as part of the Ferret search
|
||||
# engine library.
|
||||
#
|
||||
# Author: Rakan El-Khalil <rakan@well.com>
|
||||
|
||||
require 'optparse'
|
||||
require 'open-uri'
|
||||
|
||||
ENCODINGS = [ :utf8, :ucs4 ]
|
||||
ALPHTYPES = { :utf8 => "byte", :ucs4 => "rune" }
|
||||
DEFAULT_CHART_URL = "http://www.unicode.org/Public/5.1.0/ucd/DerivedCoreProperties.txt"
|
||||
DEFAULT_MACHINE_NAME= "WChar"
|
||||
|
||||
###
|
||||
# Display vars & default option
|
||||
|
||||
TOTAL_WIDTH = 80
|
||||
RANGE_WIDTH = 23
|
||||
@encoding = :utf8
|
||||
@chart_url = DEFAULT_CHART_URL
|
||||
machine_name = DEFAULT_MACHINE_NAME
|
||||
properties = []
|
||||
@output = $stdout
|
||||
|
||||
###
|
||||
# Option parsing
|
||||
|
||||
cli_opts = OptionParser.new do |opts|
|
||||
opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o|
|
||||
@encoding = o.downcase.to_sym
|
||||
end
|
||||
opts.on("-h", "--help", "Show this message") do
|
||||
puts opts
|
||||
exit
|
||||
end
|
||||
opts.on("-u", "--url URL", "URL to process") do |o|
|
||||
@chart_url = o
|
||||
end
|
||||
opts.on("-m", "--machine MACHINE_NAME", "Machine name") do |o|
|
||||
machine_name = o
|
||||
end
|
||||
opts.on("-p", "--properties x,y,z", Array, "Properties to add to machine") do |o|
|
||||
properties = o
|
||||
end
|
||||
opts.on("-o", "--output FILE", "output file") do |o|
|
||||
@output = File.new(o, "w+")
|
||||
end
|
||||
end
|
||||
|
||||
cli_opts.parse(ARGV)
|
||||
unless ENCODINGS.member? @encoding
|
||||
puts "Invalid encoding: #{@encoding}"
|
||||
puts cli_opts
|
||||
exit
|
||||
end
|
||||
|
||||
##
|
||||
# Downloads the document at url and yields every alpha line's hex
|
||||
# range and description.
|
||||
|
||||
def each_alpha( url, property )
|
||||
URI.open( url ) do |file|
|
||||
file.each_line do |line|
|
||||
next if line =~ /^#/;
|
||||
next if line !~ /; #{property} *#/;
|
||||
|
||||
range, description = line.split(/;/)
|
||||
range.strip!
|
||||
description.gsub!(/.*#/, '').strip!
|
||||
|
||||
if range =~ /\.\./
|
||||
start, stop = range.split '..'
|
||||
else start = stop = range
|
||||
end
|
||||
|
||||
yield start.hex .. stop.hex, description
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
###
|
||||
# Formats to hex at minimum width
|
||||
|
||||
def to_hex( n )
|
||||
r = "%0X" % n
|
||||
r = "0#{r}" unless (r.length % 2).zero?
|
||||
r
|
||||
end
|
||||
|
||||
###
|
||||
# UCS4 is just a straight hex conversion of the unicode codepoint.
|
||||
|
||||
def to_ucs4( range )
|
||||
rangestr = "0x" + to_hex(range.begin)
|
||||
rangestr << "..0x" + to_hex(range.end) if range.begin != range.end
|
||||
[ rangestr ]
|
||||
end
|
||||
|
||||
##
|
||||
# 0x00 - 0x7f -> 0zzzzzzz[7]
|
||||
# 0x80 - 0x7ff -> 110yyyyy[5] 10zzzzzz[6]
|
||||
# 0x800 - 0xffff -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6]
|
||||
# 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6]
|
||||
|
||||
UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff]
|
||||
|
||||
def to_utf8_enc( n )
|
||||
r = 0
|
||||
if n <= 0x7f
|
||||
r = n
|
||||
elsif n <= 0x7ff
|
||||
y = 0xc0 | (n >> 6)
|
||||
z = 0x80 | (n & 0x3f)
|
||||
r = y << 8 | z
|
||||
elsif n <= 0xffff
|
||||
x = 0xe0 | (n >> 12)
|
||||
y = 0x80 | (n >> 6) & 0x3f
|
||||
z = 0x80 | n & 0x3f
|
||||
r = x << 16 | y << 8 | z
|
||||
elsif n <= 0x10ffff
|
||||
w = 0xf0 | (n >> 18)
|
||||
x = 0x80 | (n >> 12) & 0x3f
|
||||
y = 0x80 | (n >> 6) & 0x3f
|
||||
z = 0x80 | n & 0x3f
|
||||
r = w << 24 | x << 16 | y << 8 | z
|
||||
end
|
||||
|
||||
to_hex(r)
|
||||
end
|
||||
|
||||
def from_utf8_enc( n )
|
||||
n = n.hex
|
||||
r = 0
|
||||
if n <= 0x7f
|
||||
r = n
|
||||
elsif n <= 0xdfff
|
||||
y = (n >> 8) & 0x1f
|
||||
z = n & 0x3f
|
||||
r = y << 6 | z
|
||||
elsif n <= 0xefffff
|
||||
x = (n >> 16) & 0x0f
|
||||
y = (n >> 8) & 0x3f
|
||||
z = n & 0x3f
|
||||
r = x << 10 | y << 6 | z
|
||||
elsif n <= 0xf7ffffff
|
||||
w = (n >> 24) & 0x07
|
||||
x = (n >> 16) & 0x3f
|
||||
y = (n >> 8) & 0x3f
|
||||
z = n & 0x3f
|
||||
r = w << 18 | x << 12 | y << 6 | z
|
||||
end
|
||||
r
|
||||
end
|
||||
|
||||
###
|
||||
# Given a range, splits it up into ranges that can be continuously
|
||||
# encoded into utf8. Eg: 0x00 .. 0xff => [0x00..0x7f, 0x80..0xff]
|
||||
# This is not strictly needed since the current [5.1] unicode standard
|
||||
# doesn't have ranges that straddle utf8 boundaries. This is included
|
||||
# for completeness as there is no telling if that will ever change.
|
||||
|
||||
def utf8_ranges( range )
|
||||
ranges = []
|
||||
UTF8_BOUNDARIES.each do |max|
|
||||
if range.begin <= max
|
||||
if range.end <= max
|
||||
ranges << range
|
||||
return ranges
|
||||
end
|
||||
|
||||
ranges << (range.begin .. max)
|
||||
range = (max + 1) .. range.end
|
||||
end
|
||||
end
|
||||
ranges
|
||||
end
|
||||
|
||||
def build_range( start, stop )
|
||||
size = start.size/2
|
||||
left = size - 1
|
||||
return [""] if size < 1
|
||||
|
||||
a = start[0..1]
|
||||
b = stop[0..1]
|
||||
|
||||
###
|
||||
# Shared prefix
|
||||
|
||||
if a == b
|
||||
return build_range(start[2..-1], stop[2..-1]).map do |elt|
|
||||
"0x#{a} " + elt
|
||||
end
|
||||
end
|
||||
|
||||
###
|
||||
# Unshared prefix, end of run
|
||||
|
||||
return ["0x#{a}..0x#{b} "] if left.zero?
|
||||
|
||||
###
|
||||
# Unshared prefix, not end of run
|
||||
# Range can be 0x123456..0x56789A
|
||||
# Which is equivalent to:
|
||||
# 0x123456 .. 0x12FFFF
|
||||
# 0x130000 .. 0x55FFFF
|
||||
# 0x560000 .. 0x56789A
|
||||
|
||||
ret = []
|
||||
ret << build_range(start, a + "FF" * left)
|
||||
|
||||
###
|
||||
# Only generate middle range if need be.
|
||||
|
||||
if a.hex+1 != b.hex
|
||||
max = to_hex(b.hex - 1)
|
||||
max = "FF" if b == "FF"
|
||||
ret << "0x#{to_hex(a.hex+1)}..0x#{max} " + "0x00..0xFF " * left
|
||||
end
|
||||
|
||||
###
|
||||
# Don't generate last range if it is covered by first range
|
||||
|
||||
ret << build_range(b + "00" * left, stop) unless b == "FF"
|
||||
ret.flatten!
|
||||
end
|
||||
|
||||
def to_utf8( range )
|
||||
utf8_ranges( range ).map do |r|
|
||||
begin_enc = to_utf8_enc(r.begin)
|
||||
end_enc = to_utf8_enc(r.end)
|
||||
build_range begin_enc, end_enc
|
||||
end.flatten!
|
||||
end
|
||||
|
||||
##
|
||||
# Perform a 3-way comparison of the number of codepoints advertised by
|
||||
# the unicode spec for the given range, the originally parsed range,
|
||||
# and the resulting utf8 encoded range.
|
||||
|
||||
def count_codepoints( code )
|
||||
code.split(' ').inject(1) do |acc, elt|
|
||||
if elt =~ /0x(.+)\.\.0x(.+)/
|
||||
if @encoding == :utf8
|
||||
acc * (from_utf8_enc($2) - from_utf8_enc($1) + 1)
|
||||
else
|
||||
acc * ($2.hex - $1.hex + 1)
|
||||
end
|
||||
else
|
||||
acc
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def is_valid?( range, desc, codes )
|
||||
spec_count = 1
|
||||
spec_count = $1.to_i if desc =~ /\[(\d+)\]/
|
||||
range_count = range.end - range.begin + 1
|
||||
|
||||
sum = codes.inject(0) { |acc, elt| acc + count_codepoints(elt) }
|
||||
sum == spec_count and sum == range_count
|
||||
end
|
||||
|
||||
##
|
||||
# Generate the state maching to stdout
|
||||
|
||||
def generate_machine( name, property )
|
||||
pipe = " "
|
||||
@output.puts " #{name} = "
|
||||
each_alpha( @chart_url, property ) do |range, desc|
|
||||
|
||||
codes = (@encoding == :ucs4) ? to_ucs4(range) : to_utf8(range)
|
||||
|
||||
#raise "Invalid encoding of range #{range}: #{codes.inspect}" unless
|
||||
# is_valid? range, desc, codes
|
||||
|
||||
range_width = codes.map { |a| a.size }.max
|
||||
range_width = RANGE_WIDTH if range_width < RANGE_WIDTH
|
||||
|
||||
desc_width = TOTAL_WIDTH - RANGE_WIDTH - 11
|
||||
desc_width -= (range_width - RANGE_WIDTH) if range_width > RANGE_WIDTH
|
||||
|
||||
if desc.size > desc_width
|
||||
desc = desc[0..desc_width - 4] + "..."
|
||||
end
|
||||
|
||||
codes.each_with_index do |r, idx|
|
||||
desc = "" unless idx.zero?
|
||||
code = "%-#{range_width}s" % r
|
||||
@output.puts " #{pipe} #{code} ##{desc}"
|
||||
pipe = "|"
|
||||
end
|
||||
end
|
||||
@output.puts " ;"
|
||||
@output.puts ""
|
||||
end
|
||||
|
||||
@output.puts <<EOF
|
||||
# The following Ragel file was autogenerated with #{$0}
|
||||
# from: #{@chart_url}
|
||||
#
|
||||
# It defines #{properties}.
|
||||
#
|
||||
# To use this, make sure that your alphtype is set to #{ALPHTYPES[@encoding]},
|
||||
# and that your input is in #{@encoding}.
|
||||
|
||||
%%{
|
||||
machine #{machine_name};
|
||||
|
||||
EOF
|
||||
|
||||
properties.each { |x| generate_machine( x, x ) }
|
||||
|
||||
@output.puts <<EOF
|
||||
}%%
|
||||
EOF
|
@ -0,0 +1,19 @@
|
||||
package textseg
|
||||
|
||||
import "unicode/utf8"
|
||||
|
||||
// ScanGraphemeClusters is a split function for bufio.Scanner that splits
|
||||
// on UTF8 sequence boundaries.
|
||||
//
|
||||
// This is included largely for completeness, since this behavior is already
|
||||
// built in to Go when ranging over a string.
|
||||
func ScanUTF8Sequences(data []byte, atEOF bool) (int, []byte, error) {
|
||||
if len(data) == 0 {
|
||||
return 0, nil, nil
|
||||
}
|
||||
r, seqLen := utf8.DecodeRune(data)
|
||||
if r == utf8.RuneError && !atEOF {
|
||||
return 0, nil, nil
|
||||
}
|
||||
return seqLen, data[:seqLen], nil
|
||||
}
|
Loading…
Reference in New Issue