POD2::JA::Unicode::LineBreak(3) User Contributed Perl Documentation NAME Unicode::LineBreak~[ja] - UAX #14 Unicode SYNOPSIS use Unicode::LineBreak; $lb = Unicode::LineBreak->new(); $broken = $lb->break($string); DESCRIPTION Unicode::LineBreak Unicode 14 [UAX #14] Unicode 11 [UAX #11] East_Asian_Width mandatory break [UAX #14] direct breakindirect break alphabetic characters ideographic characters [UAX #14] AL ID () widenarrownonspacing 2 1 0 PUBLIC INTERFACE new ([KEY => VALUE, ...]) KEY => VALUE "" break (STRING) Unicode STRING break_partial (STRING) break() STRING "undef" config (KEY) config (KEY => VALUE, ...) KEY => VALUE "" copy breakingRule (BEFORESTR, AFTERSTR) BEFORESTR AFTERSTR "" : break() context ([Charset => CHARSET], [Language => LANGUAGE]) CHARSET LANGUAGE / "new""config" ([E]) ([G]) (Unicode::GCString~[ja] ) ([L]) BreakIndent => "YES" | "NO" [L] SPACE () [UAX #14] SPACE "YES" : 1.011 CharMax => NUMBER [L] 998 0 ColMin => NUMBER [L] 0 ColMax => NUMBER [L] 76 "Urgent" "" ComplexBreaking => "YES" | "NO" [L] "YES" Context => CONTEXT [E][L] / "EASTASIAN" "NONEASTASIAN" "NONEASTASIAN" "EASTASIAN" East_Asian_Width (A) AI (ID) "NONEASTASIAN" East_Asian_Width (A) AI (AL) EAWidth => "[" ORD "=>" PROPERTY "]" EAWidth => "undef" [E] East_Asian_Width ORD UCS PROPERTY East_Asian_Width ("" ) "undef" East_Asian_width "" Format => METHOD [L] "SIMPLE" "NEWLINE" "Newline" "TRIM" "undef" () "" HangulAsAL => "YES" | "NO" [L] conjoining jamo (AL) "NO" LBClass => "[" ORD "=>" CLASS "]" LBClass => "undef" [G][L] () ORD UCS CLASS ("" ) "undef" "" LegacyCM => "YES" | "NO" [G][L] (ID) Unicode 5.0 "YES" Newline => STRING [L] Unicode "\n" Prep => METHOD [L] METHOD "NONBREAKURI" URI "BREAKURI" URI [CMOS] 6.17 17.11 "[" REGEX, SUBREF "]" REGEX SUBREF "" "undef" Sizing => METHOD [L] "UAX11" "undef" (Unicode::GCString ) "" "ColMax""ColMin""EAWidth" Urgent => METHOD [L] "CROAK" "FORCE" "undef" "" ViramaAsJoiner => "YES" | "NO" [G] () "YES" : 2011.001_29 "NO" [UAX #29] "EA_Na", 
"EA_N", "EA_A", "EA_W", "EA_H", "EA_F" [UAX #11] 6 East_Asian_Width (Na) (N) (A) (W) (H) (F) "EA_Z" East_Asian_Width : [UAX #11] "LB_BK", "LB_CR", "LB_LF", "LB_NL", "LB_SP", "LB_OP", "LB_CL", "LB_CP", "LB_QU", "LB_GL", "LB_NS", "LB_EX", "LB_SY", "LB_IS", "LB_PR", "LB_PO", "LB_NU", "LB_AL", "LB_HL", "LB_ID", "LB_IN", "LB_HY", "LB_BA", "LB_BB", "LB_B2", "LB_CB", "LB_ZW", "LB_CM", "LB_WJ", "LB_H2", "LB_H3", "LB_JL", "LB_JV", "LB_JT", "LB_SG", "LB_AI", "LB_CJ", "LB_SA", "LB_XX", "LB_RI" [UAX #14] 40 () : CP Unicode 5.2.0 HL CJ Unicode 6.1.0 RI Unicode 6.2.0 "MANDATORY", "DIRECT", "INDIRECT", "PROHIBITED" 4 "Unicode::LineBreak::SouthEastAsian::supported" "undef" : "UNICODE_VERSION" Unicode CUSTOMIZATION "Format" 3 $ = &(SELF, EVENT, STR); SELF Unicode::LineBreak EVENT STR Unicode EVENT | |STR ----------------------------------------------------------------- "sot" | | "sop" | | "sol" | | "" | | () "eol" | | "eop" | | "eot" | | () ----------------------------------------------------------------- "undef" "sot""sop""sol" : Unicode::GCString~[ja] sub fmt { if ($_[1] =~ /^eo/) { return "\n"; } return undef; } my $lb = Unicode::LineBreak->new(Format => \&fmt); $output = $lb->break($text); CharMaxColMaxColMin "Urgent" 2 @ = &(SELF, STR); SELF Unicode::LineBreak STR Unicode STR : Unicode::GCString~[ja] () sub hyphenize { return map {$_ =~ s/yl$/yl-/; $_} split /(\w+?yl(?=\w))/, $_[1]; } my $lb = Unicode::LineBreak->new(Urgent => \&hyphenize); $output = $lb->break("Methionylthreonylthreonylglutaminylarginyl..."); "Prep" [REGEX, SUBREF] 2 @ = &(SELF, STR); SELF Unicode::LineBreak STR REGEX Unicode STR HTTP URL [CMOS] my $url = qr{http://[\x21-\x7E]+}i; sub breakurl { my $self = shift; my $str = shift; return split m{(?<=[/]) (?=[^/]) | (?<=[^-.]) (?=[-~.,_?\#%=&]) | (?<=[=&]) (?=.)}x, $str; } my $lb = Unicode::LineBreak->new(Prep => [$url, \&breakurl]); $output = $lb->break($string); Unicode::LineBreak sub paraformat { my $self = shift; my $action = shift; my $str = shift; if 
($action eq 'sot' or $action eq 'sop') { $self->{'line'} = ''; } elsif ($action eq '') { $self->{'line'} = $str; } elsif ($action eq 'eol') { return "\n"; } elsif ($action eq 'eop') { if (length $self->{'line'}) { return "\n\n"; } else { return "\n"; } } elsif ($action eq 'eot') { return "\n"; } return undef; } my $lb = Unicode::LineBreak->new(Format => \&paraformat); $output = $lb->break($string); "Sizing" 5 $ = &(SELF, LEN, PRE, SPC, STR); SELF Unicode::LineBreak LEN PRE Unicode SPC STR Unicode "PRE.SPC.STR" "ColMin" "ColMax" : Unicode::GCString~[ja] 8 sub tabbedsizing { my ($self, $cols, $pre, $spc, $str) = @_; my $spcstr = $spc.$str; while ($spcstr->lbc == LB_SP) { my $c = $spcstr->item(0); if ($c eq "\t") { $cols += 8 - $cols % 8; } else { $cols += $c->columns; } $spcstr = $spcstr->substr(1); } $cols += $spcstr->columns; return $cols; }; my $lb = Unicode::LineBreak->new(LBClass => [ord("\t") => LB_SP], Sizing => \&tabbedsizing); $output = $lb->break($string); "LBClass" "EAWidth" () East_Asian_Width (NS CJ) LBClass (ID) "KANA_NONSTARTERS() => LB_ID" "IDEOGRAPHIC_ITERATION_MARKS() => LB_ID" U+3005 U+303B U+309D U+309E ()U+30FD U+30FE () "KANA_SMALL_LETTERS() => LB_ID" "KANA_PROLONGED_SOUND_MARKS() => LB_ID" U+3041 , U+3043 , U+3045 , U+3047 , U+3049 , U+3063 , U+3083 , U+3085 , U+3087 , U+308E , U+3095 , U+3096 U+30A1 , U+30A3 , U+30A5 , U+30A7 , U+30A9 , U+30C3 , U+30E3 , U+30E5 , U+30E7 , U+30EE , U+30F5 , U+30F6 U+31F0 - U+31FF () U+FF67 - U+FF6F U+30FC U+FF70 () [JIS X 4051] 6.1.1[JLREQ] 3.1.7 [UAX #14] U+3095 , U+3096 , U+30F5 , U+30F6 "MASU_MARK() => LB_ID" U+303C "" "" [UAX #14] (NS) [JIS X 4051] [JLREQ] (13) cl-19 (ID ) (QU) "BACKWARD_QUOTES() => LB_OP, FORWARD_QUOTES() => LB_CL" ( ) 9 (` ") 9 (' ") "FORWARD_QUOTES() => LB_OP, BACKWARD_QUOTES() => LB_CL" () 9 (' ") 9 (` ") "BACKWARD_GUILLEMETS() => LB_OP, FORWARD_GUILLEMETS() => LB_CL" (<< <) (>> >) "FORWARD_GUILLEMETS() => LB_OP, BACKWARD_GUILLEMETS() => LB_CL" (>> >) (<< <) 9 (' " >> >) "IDEOGRAPHIC_SPACE() 
=> LB_BA" U+3000 "IDEOGRAPHIC_SPACE() => LB_ID" Unicode 6.2 "IDEOGRAPHIC_SPACE() => LB_SP" East_Asian_Width (A) East_Asian_Width "EASTASIAN" "EAWidth => [ AMBIGUOUS_"*"() => EA_N ]" "AMBIGUOUS_ALPHABETICS() => EA_N" East_Asian_Width N () "AMBIGUOUS_CYRILLIC() => EA_N" "AMBIGUOUS_GREEK() => EA_N" "AMBIGUOUS_LATIN() => EA_N" (A) (N) Unicode (F) (Na) EAWidth "EASTASIAN" "QUESTIONABLE_NARROW_SIGNS() => EA_A" U+00A2 U+00A3 U+00A5 ()U+00A6 U+00AC U+00AF "new" "config" Unicode/LineBreak/Defaults.pm Unicode/LineBreak/Defaults.pm.sample BUGS CPAN Request Tracker: . VERSION $VERSION 2012.06 o eawidth() "columns" in Unicode::GCString o lbclass() "lbc" in Unicode::GCString "lbcext" in Unicode::GCString Unicode 8.0.0 UAX14-C2 IMPLEMENTATION NOTES o NS ID o ID AL o AI AL ID o CB o CJ NS o SA AL Grapheme_Cluster_Break Extend SpacingMark CM o SG XX AL o UCS | UAX #14 | UAX #11 | ------------------------------------------------------------- U+20A0..U+20CF | PR [*1] | N [*2] | U+3400..U+4DBF | ID | W | CJK U+4E00..U+9FFF | ID | W | CJK U+D800..U+DFFF | AL (SG) | N | U+E000..U+F8FF | AL (XX) | F N (A) | U+F900..U+FAFF | ID | W | CJK U+20000..U+2FFFD | ID | W | CJK U+30000..U+3FFFD | ID | W | U+F0000..U+FFFFD | AL (XX) | F N (A) | U+100000..U+10FFFD | AL (XX) | F N (A) | | AL (XX) | N | | | | ------------------------------------------------------------- [*1] U+20A7 (PO)U+20B6 (PO)U+20BB (PO)U+20BE (PO) [*2] U+20A9 (H)U+20AC (F N (A)) o MnMeCcCfZlZp REFERENCES [CMOS] The Chicago Manual of Style, 15th edition. University of Chicago Press, 2003. [JIS X 4051] JIS X 4051:2004 . , 2004. [JLREQ] . , W3C 201243. . [UAX #11] A. Freytag (ed.) (2008-2009). Unicode Standard Annex #11: East Asian Width, Revisions 17-19. . [UAX #14] A. Freytag and A. Heninger (eds.) (2008-2015). Unicode Standard Annex #14: Unicode Line Breaking Algorithm, Revisions 22-35. . [UAX #29] Mark Davis (ed.) (2009-2013). Unicode Standard Annex #29: Unicode Text Segmentation, Revisions 15-23. . 
SEE ALSO Text::LineFold~[ja], Text::Wrap, Unicode::GCString~[ja]. AUTHOR Copyright (C) 2009-2018 Hatuka*nezumi - IKEDA Soji . This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. perl v5.38.0 2023-07-25 POD2::JA::Unicode::LineBreak(3)