ASPN ActiveState Programmer Network
ActiveState
/ Home / Perl / PHP / Python / Tcl / XSLT /
/ Safari / My ASPN /
Cookbooks | Documentation | Mailing Lists | Modules | News Feeds | Products | User Groups


Recent Messages
List Archives
About the List
List Leaders
Subscription Options

View Subscriptions
Help

View by Topic
ActiveState
.NET Framework
Open Source
Perl
PHP
Python
Tcl
Web Services
XML & XSLT

View by Category
Database
General
SOAP
System Administration
Tools
User Interfaces
Web Programming
XML Programming


MyASPN >> Mail Archive >> perl5-porters
perl5-porters
[PATCH @11446] UnicodeCD::charinfo
by SADAHIRO Tomoyuki other posts by this author
Jul 23 2001 4:51PM messages near this date
Re: [PATCH bleadperl os2/perlrexx.c] my C compiler has no HTML parser | Re: [PATCH @11446] UnicodeCD::charinfo
Hello, this is a patch for /UnicodeCD\.(?:pm|t)/.

Since Unicode.txt is not sorted in dictionary order,
 e.g
  FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
  10300;OLD ITALIC LETTER A;Lo;0;L;;;;;N;;;;;

then, a sorted file is necessary, isn't it?

  !lib/UnicodeCD.pm
  !lib/UnicodeCD.t
  +Unicode.sort

(but Unicode.sort is not attached,
 considering its hugeness in size 
 and easiness to prepare from Unicode.txt)

##### BEGIN PATCH #####
diff -urN orig/lib/UnicodeCD.pm lib/UnicodeCD.pm
--- orig/lib/UnicodeCD.pm	Sun Jul 22 08:02:50 2001
+++ lib/UnicodeCD.pm	Tue Jul 24 00:11:02 2001
@@ -119,14 +119,129 @@
     return;
 }
 
+sub han_charname {
+    my $arg  = shift;
+    my $code = _getcode($arg);
+    croak __PACKAGE__, "::charinfo: unknown code '$arg'"
+	unless defined $code;
+    croak __PACKAGE__, "::han_charname: outside CJK Unified Ideographs '$arg'"
+        unless 0x3400  <= $code && $code <= 0x4DB5  
+            || 0x4E00  <= $code && $code <= 0x9FA5  
+            || 0x20000 <= $code && $code <= 0x2A6D6;
+    sprintf "CJK UNIFIED IDEOGRAPH-%04X", $code;
+}
+
+my @JamoL = ( # Leading Consonant (HANGUL CHOSEONG)
+    "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
+    "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H",
+  );
+
+my @JamoV = ( # Medium Vowel (HANGUL JUNGSEONG)
+    "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
+    "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
+    "YU", "EU", "YI", "I",
+  );
+
+my @JamoT = ( # Trailing Consonant (HANGUL JONGSEONG)
+    "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
+    "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
+    "S", "SS", "NG", "J", "C", "K", "T", "P", "H",
+  );
+
+my %HangulConst = (
+   SBase  =>  0xAC00,
+   LBase  =>  0x1100,
+   VBase  =>  0x1161,
+   TBase  =>  0x11A7,
+   LCount =>  19,     # scalar @JamoL
+   VCount =>  21,     # scalar @JamoV
+   TCount =>  28,     # scalar @JamoT
+   NCount =>  588,    # VCount * TCount
+   SCount =>  11172,  # LCount * NCount
+   Final  =>  0xD7A3, # SBase -1 + SCount
+  );
+
+sub hangul_charname {
+    my $arg  = shift;
+    my $code = _getcode($arg);
+    croak __PACKAGE__, "::charinfo: unknown code '$arg'"
+	unless defined $code;
+    croak __PACKAGE__, "::hangul_charname: outside Hangul Syllables '$arg'"
+        unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
+    my $SIndex = $code - $HangulConst{SBase};
+    my $LIndex = int( $SIndex / $HangulConst{NCount});
+    my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
+    my $TIndex =      $SIndex % $HangulConst{TCount};
+    return join('',
+        "HANGUL SYLLABLE ",
+        $JamoL[$LIndex],
+        $JamoV[$VIndex],
+        $JamoT[$TIndex],
+      );
+}
+
+sub hangul_decomp {
+    my $arg  = shift;
+    my $code = _getcode($arg);
+    croak __PACKAGE__, "::charinfo: unknown code '$arg'"
+	unless defined $code;
+    croak __PACKAGE__, "::hangul_decomp: outside Hangul Syllables '$arg'"
+        unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
+    my $SIndex = $code - $HangulConst{SBase};
+    my $LIndex = int( $SIndex / $HangulConst{NCount});
+    my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
+    my $TIndex =      $SIndex % $HangulConst{TCount};
+
+    return join(" ",
+        sprintf("%04X", $HangulConst{LBase} + $LIndex),
+        sprintf("%04X", $HangulConst{VBase} + $VIndex),
+      $TIndex ?
+        sprintf("%04X", $HangulConst{TBase} + $TIndex) : (),
+    );
+}
+
+my @CharinfoRanges = (
+# block name
+# [ first, last, coderef to name, coderef to decompose ],
+# CJK Ideographs Extension A
+  [ 0x3400,   0x4DB5,   \&han_charname,   undef  ],
+# CJK Ideographs
+  [ 0x4E00,   0x9FA5,   \&han_charname,   undef  ],
+# Hangul Syllables
+  [ 0xAC00,   0xD7A3,   \&hangul_charname, \&hangul_decomp  ],
+# Non-Private Use High Surrogates
+  [ 0xD800,   0xDB7F,   undef,   undef  ],
+# Private Use High Surrogates
+  [ 0xDB80,   0xDBFF,   undef,   undef  ],
+# Low Surrogates
+  [ 0xDC00,   0xDFFF,   undef,   undef  ],
+# The Private Use Area
+  [ 0xE000,   0xF8FF,   undef,   undef  ],
+# CJK Ideographs Extension B
+  [ 0x20000,  0x2A6D6,  \&han_charname,   undef  ],
+# Plane 15 Private Use Area
+  [ 0xF0000,  0xFFFFD,  undef,   undef  ],
+# Plane 16 Private Use Area
+  [ 0x100000, 0x10FFFD, undef,   undef  ],
+);
+
 sub charinfo {
     my $arg  = shift;
     my $code = _getcode($arg);
     croak __PACKAGE__, "::charinfo: unknown code '$arg'"
 	unless defined $code;
     my $hexk = sprintf("%04X", $code);
-
-    openunicode(\$UNICODEFH, "Unicode.txt");
+    my($rcode,$rname,$rdec);
+    foreach my $range (@CharinfoRanges){
+      if($range-> [0] <= $code && $code <= $range->[1]){
+        $rcode = $hexk;
+        $rname = $range-> [2] ? $range->[2]->($code) : '';
+        $rdec  = $range-> [3] ? $range->[3]->($code) : '';
+        $hexk  = sprintf("%04X",$range-> [0]); # replace by the first
+        last;
+      }
+    }
+    openunicode(\$UNICODEFH, "Unicode.sort"); # sorted
     if (defined $UNICODEFH) {
 	use Search::Dict;
 	if (look($UNICODEFH, "$hexk;") > = 0) {
@@ -143,6 +258,11 @@
 	    if ($prop{code} eq $hexk) {
 		$prop{block}  = charblock($code);
 		$prop{script} = charscript($code);
+		if(defined $rname){
+                    $prop{code} = $rcode;
+                    $prop{name} = $rname;
+                    $prop{decomposition} = $rdec;
+                }
 		return \%prop;
 	    }
 	}
diff -urN orig/lib/UnicodeCD.t lib/UnicodeCD.t
--- orig/lib/UnicodeCD.t	Fri Jul 13 00:22:26 2001
+++ lib/UnicodeCD.t	Tue Jul 24 01:37:04 2001
@@ -3,7 +3,7 @@
 use Test;
 use strict;
 
-BEGIN { plan tests =>  111 };
+BEGIN { plan tests =>  111 + 17 * 3};
 
 use UnicodeCD 'charinfo';
 
@@ -92,6 +92,70 @@
 ok($charinfo-> {title},          '');
 ok($charinfo-> {block},          'Hebrew');
 ok($charinfo-> {script},         'Hebrew');
+
+# an open syllable in Hangul
+
+$charinfo = charinfo(0xAC00);
+
+ok($charinfo-> {code},           'AC00');
+ok($charinfo-> {name},           'HANGUL SYLLABLE GA');
+ok($charinfo-> {category},       'Lo');
+ok($charinfo-> {combining},      '0');
+ok($charinfo-> {bidi},           'L');
+ok($charinfo-> {decomposition},  '1100 1161');
+ok($charinfo-> {decimal},        '');
+ok($charinfo-> {digit},          '');
+ok($charinfo-> {numeric},        '');
+ok($charinfo-> {mirrored},       'N');
+ok($charinfo-> {unicode10},      '');
+ok($charinfo-> {comment},        '');
+ok($charinfo-> {upper},          '');
+ok($charinfo-> {lower},          '');
+ok($charinfo-> {title},          '');
+ok($charinfo-> {block},          'Hangul Syllables');
+ok($charinfo-> {script},         'Hangul');
+
+# a close syllable in Hangul
+
+$charinfo = charinfo(0xAE00);
+
+ok($charinfo-> {code},           'AE00');
+ok($charinfo-> {name},           'HANGUL SYLLABLE GEUL');
+ok($charinfo-> {category},       'Lo');
+ok($charinfo-> {combining},      '0');
+ok($charinfo-> {bidi},           'L');
+ok($charinfo-> {decomposition},  '1100 1173 11AF');
+ok($charinfo-> {decimal},        '');
+ok($charinfo-> {digit},          '');
+ok($charinfo-> {numeric},        '');
+ok($charinfo-> {mirrored},       'N');
+ok($charinfo-> {unicode10},      '');
+ok($charinfo-> {comment},        '');
+ok($charinfo-> {upper},          '');
+ok($charinfo-> {lower},          '');
+ok($charinfo-> {title},          '');
+ok($charinfo-> {block},          'Hangul Syllables');
+ok($charinfo-> {script},         'Hangul');
+
+$charinfo = charinfo(0x1D400);
+
+ok($charinfo-> {code},           '1D400');
+ok($charinfo-> {name},           'MATHEMATICAL BOLD CAPITAL A');
+ok($charinfo-> {category},       'Lu');
+ok($charinfo-> {combining},      '0');
+ok($charinfo-> {bidi},           'L');
+ok($charinfo-> {decomposition},  '<font> 0041');
+ok($charinfo-> {decimal},        '');
+ok($charinfo-> {digit},          '');
+ok($charinfo-> {numeric},        '');
+ok($charinfo-> {mirrored},       'N');
+ok($charinfo-> {unicode10},      '');
+ok($charinfo-> {comment},        '');
+ok($charinfo-> {upper},          '');
+ok($charinfo-> {lower},          '');
+ok($charinfo-> {title},          '');
+ok($charinfo-> {block},          'Mathematical Alphanumeric Symbols');
+ok($charinfo-> {script},         undef);
 
 use UnicodeCD qw(charblock charscript);
 
##### END OF PATCH #####

-----
regards,
SADAHIRO Tomoyuki
E-mail: bqw10602@nifty.com
Thread:
SADAHIRO Tomoyuki
Nicholas Clark
Simon Cozens

Privacy Policy | Email Opt-out | Feedback | Syndication
© ActiveState Software Inc. All rights reserved