sequences.
+Fixed bug #775: char conversion does not handle Alternative UTF-8
+sequences.
+
Implemented function yaz_marc_read_line which parses MARC line format
records. There is a wealth of formats out there. So far, this function
reads line records produced by yaz_marc_write_line.
# the next line restarts using tclsh \
if [ -f /usr/local/bin/tclsh8.4 ]; then exec tclsh8.4 "$0" "$@"; else exec tclsh "$0" "$@"; fi
#
-# $Id: charconv.tcl,v 1.17 2006-08-30 20:40:18 adam Exp $
+# $Id: charconv.tcl,v 1.18 2006-12-17 15:34:11 adam Exp $
proc usage {} {
puts {charconv.tcl: [-p prefix] [-s split] [-o ofile] file ... }
set marc_lines 0
set ucs_lines 0
set utf_lines 0
+ set altutf_lines 0
set codename_lines 0
set lineno 0
set f [open $fname r]
set tablenumber x
set combining 0
set codename {}
+ set altutf {}
while {1} {
incr lineno
set cnt [gets $f line]
# puts "ins_trie $hex $marc
ins_trie $hex $marc $combining $codename
unset hex
+
} else {
for {set i 0} {$i < [string length $marc]} {incr i 2} {
lappend hex [string range $marc $i [expr $i+1]]
unset hex
}
}
+ if {$reverse && [string length $marc]} {
+ for {set i 0} {$i < [string length $altutf]} {incr i 2} {
+ lappend hex [string range $altutf $i [expr $i+1]]
+ }
+ if {[info exists hex]} {
+ ins_trie $hex $marc $combining $codename
+ unset hex
+ }
+ }
set marc {}
set uni {}
set codename {}
set combining 0
+ set altutf {}
} elseif {[regexp {<marc>([0-9A-Fa-f]*)</marc>} $line s marc]} {
incr marc_lines
} elseif {[regexp {<name>(.*)</name>} $line s codename]} {
incr ucs_lines
} elseif {[regexp {<utf-8>([0-9A-Fa-f]*)</utf-8>} $line s utf]} {
incr utf_lines
+ } elseif {[regexp {<altutf-8>([0-9A-Fa-f]*)</altutf-8>} $line s altutf]} {
+ incr altutf_lines
}
}
close $f
## Copyright (C) 1994-2006, Index Data ApS
## All rights reserved.
-## $Id: Makefile.am,v 1.27 2006-11-29 12:48:59 heikki Exp $
+## $Id: Makefile.am,v 1.28 2006-12-17 15:34:11 adam Exp $
check_PROGRAMS = tsticonv tstnmem tstmatchstr tstwrbuf tstodr tstccl tstlog \
tstsoap1 tstsoap2 tstodrstack tstlogthread tstxmlquery tstpquery \
marc6.marc marc6.xml marc6.chr marc6.xml.marc \
marc7.marc marc7.xml marc7.chr marc7.xml.marc \
marccol1.u8.marc marccol1.u8.1.lst marccol1.u8.2.lst \
+ marccol2.u8.marc marccol2.u8.1.lst marccol2.u8.2.lst \
tst_record_conv.xsl
YAZCOMP = ../util/yaz-asncomp
--- /dev/null
+03103cam a2200337 i 4500
+001 12683849
+005 20051218154744.0
+008 981008b2001 ilu 000 0 eng
+035 $a 57779
+035 $a 90490
+035 $a 93202
+040 $a DLC $c DLC
+906 $a 0 $b und $c orignew $d u $e ncip $f 19 $g y-gencatlg
+010 $a 77123332
+245 00 $a Voyager Diacritic test -- New input 001 (SBIE)
+260 $a ny $b ny, $c 2001.
+300 $a p. $c cm.
+500 $a New copy imported from file (8/12/99)
+500 $a VOYAGER COLUMN 0 (NEW): Degree sign (°); Phono Copyright mark (℗); Copyright mark (©); Sharp (♯); Inverted Question mark (¿); Inverted Exclamation mark (¡)
+500 $a VOYAGER COLUMN 1: Script L (ℓ); Polish L (Ł); Scandanavian O (Ø); D with Crossbar (Đ); Icelandic Thorn (Þ); AE Digraph (Æ); OE Digraph (Œ); Miagkii Znak (ʹ); Dot at Midline (·)
+500 $a VOYAGER COLUMN 2: Musical Flat (♭); Patent Mark (®); Plus or Minus (±); O Hook (Ơ); U Hook (Ư); Alif (ʾ); alpha (DO NOT USE); Ayn (ʻ); Polish l (ł)
+500 $a VOYAGER COLUMN 3: Scandanavian o (ø); d with crossbar (đ); Icelandic Thorn (þ); ae Digraph (æ); oe Digraph (œ); Tverdii Znak (ʺ); Turkish i (ı); British Pound (£); eth (ð)
+500 $a VOYAGER COLUMN 4: Dagger (DO NOT USE); o Hook (ơ); u Hook (ư); Beta (DO NOT USE); Gamma (DO NOT USE); Superscript 0 (⁰); Superscript 1 (¹); Superscript 2 (²); Superscript 3 (³)
+500 $a VOYAGER COLUMN 5: Superscript 4 (⁴); Superscript 5 (⁵); Superscript 6 (⁶); Superscript 7 (⁷); Superscript 8 (⁸); Superscript 9 (⁹); Superscript + (⁺); Superscript - (⁻); Superscript ( (⁽);
+500 $a VOYAGER COLUMN 6: Superscript ) (⁾); Subscript 0 (₀); Subscript 1 (₁); Subscript 2 (₂); Subscript 3 (₃); Subscript 4 (₄); Subscript 5 (₅); Subscript 6 (₆); Subscript 7 (₇)
+500 $a VOYAGER COLUMN 7: Subscript 8 (₈); Subscript 9 (₉); Subscript + (₊); Subscript - (₋); Subscript ( (₍); Subscript ) (₎); Pseudo Question Mark (ỏ); Grave (ò); Acute (ó)
+500 $a VOYAGER COLUMN 8: Circumflex (ô); Tilde (õ); Macron (ō); Breve (ŏ); Superior Dot (ȯ); Umlaut (ö); Hacek (ǒ); Circle Above (o̊); Ligature left (o︠)
+500 $a VOYAGER COLUMN 9: Ligature right (o︡) ; High Comma off center (o̕); Double Acute (ő); Candrabindu (o̐); Cedilla (o̧); Right Hook (ǫ); Dot Below (ọ); Double Dot Below (o̤); Circle Below (o̥)
+500 $a VOYAGER COLUMN 10: Double Underscore (o̳); Underscore (o̲); Left Hook (o̦); Right Cedilla (o̜); Upadhmaniya (o̮); Double Tilde 1st half (o︢); Double Tilde 2nd half (o︣) ; High Comma centered (o̓)
+500 $a VOYAGER PC Keyboard: Spacing Circumflex (^); Spacing Underscore (_); Spacing Grave (`); Open Curly Bracket ({); Close Curly Bracket (}); Spacing Tilde (~)
+500 $a Standard PC Keyboard: 1234567890-= !@#$%^&*()_+ qwertyuiop[]\ QWERTYUIOP{}| asdfghjkl;' ASDFGHJKL:" zxcvbnm,./ ZXCVBNM<>?
+
--- /dev/null
+03093cam a2200337 i 4500
+001 12683849
+005 20051218154744.0
+008 981008b2001 ilu 000 0 eng
+035 $a 57779
+035 $a 90490
+035 $a 93202
+040 $a DLC $c DLC
+906 $a 0 $b und $c orignew $d u $e ncip $f 19 $g y-gencatlg
+010 $a 77123332
+245 00 $a Voyager Diacritic test -- New input 001 (SBIE)
+260 $a ny $b ny, $c 2001.
+300 $a p. $c cm.
+500 $a New copy imported from file (8/12/99)
+500 $a VOYAGER COLUMN 0 (NEW): Degree sign (°); Phono Copyright mark (℗); Copyright mark (©); Sharp (♯); Inverted Question mark (¿); Inverted Exclamation mark (¡)
+500 $a VOYAGER COLUMN 1: Script L (ℓ); Polish L (Ł); Scandanavian O (Ø); D with Crossbar (Đ); Icelandic Thorn (Þ); AE Digraph (Æ); OE Digraph (Œ); Miagkii Znak (ʹ); Dot at Midline (·)
+500 $a VOYAGER COLUMN 2: Musical Flat (♭); Patent Mark (®); Plus or Minus (±); O Hook (Ơ); U Hook (Ư); Alif (ʼ); alpha (DO NOT USE); Ayn (ʻ); Polish l (ł)
+500 $a VOYAGER COLUMN 3: Scandanavian o (ø); d with crossbar (đ); Icelandic Thorn (þ); ae Digraph (æ); oe Digraph (œ); Tverdii Znak (ʺ); Turkish i (ı); British Pound (£); eth (ð)
+500 $a VOYAGER COLUMN 4: Dagger (DO NOT USE); o Hook (ơ); u Hook (ư); Beta (DO NOT USE); Gamma (DO NOT USE); Superscript 0 (⁰); Superscript 1 (¹); Superscript 2 (²); Superscript 3 (³)
+500 $a VOYAGER COLUMN 5: Superscript 4 (⁴); Superscript 5 (⁵); Superscript 6 (⁶); Superscript 7 (⁷); Superscript 8 (⁸); Superscript 9 (⁹); Superscript + (⁺); Superscript - (⁻); Superscript ( (⁽);
+500 $a VOYAGER COLUMN 6: Superscript ) (⁾); Subscript 0 (₀); Subscript 1 (₁); Subscript 2 (₂); Subscript 3 (₃); Subscript 4 (₄); Subscript 5 (₅); Subscript 6 (₆); Subscript 7 (₇)
+500 $a VOYAGER COLUMN 7: Subscript 8 (₈); Subscript 9 (₉); Subscript + (₊); Subscript - (₋); Subscript ( (₍); Subscript ) (₎); Pseudo Question Mark (ỏ); Grave (ò); Acute (ó)
+500 $a VOYAGER COLUMN 8: Circumflex (ô); Tilde (õ); Macron (ō); Breve (ŏ); Superior Dot (ȯ); Umlaut (ö); Hacek (ǒ); Circle Above (o̊); Ligature left (o͡)
+500 $a VOYAGER COLUMN 9: Ligature right (o) ; High Comma off center (o̕); Double Acute (ő); Candrabindu (o̐); Cedilla (o̧); Right Hook (ǫ); Dot Below (ọ); Double Dot Below (o̤); Circle Below (o̥)
+500 $a VOYAGER COLUMN 10: Double Underscore (o̳); Underscore (o̲); Left Hook (o̦); Right Cedilla (o̜); Upadhmaniya (o̮); Double Tilde 1st half (o͠); Double Tilde 2nd half (o) ; High Comma centered (o̓)
+500 $a VOYAGER PC Keyboard: Spacing Circumflex (^); Spacing Underscore (_); Spacing Grave (`); Open Curly Bracket ({); Close Curly Bracket (}); Spacing Tilde (~)
+500 $a Standard PC Keyboard: 1234567890-= !@#$%^&*()_+ qwertyuiop[]\ QWERTYUIOP{}| asdfghjkl;' ASDFGHJKL:" zxcvbnm,./ ZXCVBNM<>?
+
--- /dev/null
+03103cam a2200337 i 4500001000900000005001700009008004100026035001000067035001000077035001000087040001300097906004500110010001700155245005100172260001900223300001200242500004200254500017500296500019900471500017000670500019400840500019701034500022001231500020401451500019801655500017701853500021602030500021802246500016002464500014102624\1e12683849\1e20051218154744.0\1e981008b2001 ilu 000 0 eng \1e \1fa57779\1e \1fa90490\1e \1fa93202\1e \1faDLC\1fcDLC\1e \1fa0\1fbund\1fcorignew\1fdu\1fencip\1ff19\1fgy-gencatlg\1e \1fa 77123332 \1e00\1faVoyager Diacritic test -- New input 001 (SBIE)\1e \1fany\1fbny,\1fc2001.\1e \1fap.\1fccm.\1e \1faNew copy imported from file (8/12/99)\1e \1faVOYAGER COLUMN 0 (NEW): Degree sign (°); Phono Copyright mark (℗); Copyright mark (©); Sharp (♯); Inverted Question mark (¿); Inverted Exclamation mark (¡)\1e \1faVOYAGER COLUMN 1: Script L (ℓ); Polish L (Ł); Scandanavian O (Ø); D with Crossbar (Đ); Icelandic Thorn (Þ); AE Digraph (Æ); OE Digraph (Œ); Miagkii Znak (ʹ); Dot at Midline (·)\1e \1faVOYAGER COLUMN 2: Musical Flat (♭); Patent Mark (®); Plus or Minus (±); O Hook (Ơ); U Hook (Ư); Alif (ʾ); alpha (DO NOT USE); Ayn (ʻ); Polish l (ł)\1e \1faVOYAGER COLUMN 3: Scandanavian o (ø); d with crossbar (đ); Icelandic Thorn (þ); ae Digraph (æ); oe Digraph (œ); Tverdii Znak (ʺ); Turkish i (ı); British Pound (£); eth (ð)\1e \1faVOYAGER COLUMN 4: Dagger (DO NOT USE); o Hook (ơ); u Hook (ư); Beta (DO NOT USE); Gamma (DO NOT USE); Superscript 0 (⁰); Superscript 1 (¹); Superscript 2 (²); Superscript 3 (³)\1e \1faVOYAGER COLUMN 5: Superscript 4 (⁴); Superscript 5 (⁵); Superscript 6 (⁶); Superscript 7 (⁷); Superscript 8 (⁸); Superscript 9 (⁹); Superscript + (⁺); Superscript - (⁻); Superscript ( (⁽);\1e \1faVOYAGER COLUMN 6: Superscript ) (⁾); Subscript 0 (₀); Subscript 1 (₁); Subscript 2 (₂); Subscript 3 (₃); Subscript 4 (₄); Subscript 5 (₅); Subscript 6 (₆); Subscript 7 (₇)\1e \1faVOYAGER COLUMN 7: Subscript 8 (₈); Subscript 9 (₉); Subscript + (₊); Subscript - (₋); Subscript ( (₍); Subscript ) (₎); Pseudo Question Mark (ỏ); Grave (ò); Acute (ó)\1e \1faVOYAGER COLUMN 8: Circumflex (ô); Tilde (õ); Macron (ō); Breve (ŏ); Superior Dot (ȯ); Umlaut (ö); Hacek (ǒ); Circle Above (o̊); Ligature left (o︠)\1e \1faVOYAGER COLUMN 9: Ligature right (o︡) ; High Comma off center (o̕); Double Acute (ő); Candrabindu (o̐); Cedilla (o̧); Right Hook (ǫ); Dot Below (ọ); Double Dot Below (o̤); Circle Below (o̥)\1e \1faVOYAGER COLUMN 10: Double Underscore (o̳); Underscore (o̲); Left Hook (o̦); Right Cedilla (o̜); Upadhmaniya (o̮); Double Tilde 1st half (o︢); Double Tilde 2nd half (o︣) ; High Comma centered (o̓)\1e \1faVOYAGER PC Keyboard: Spacing Circumflex (^); Spacing Underscore (_); Spacing Grave (`); Open Curly Bracket ({); Close Curly Bracket (}); Spacing Tilde (~)\1e \1faStandard PC Keyboard: 1234567890-= !@#$%^&*()_+ qwertyuiop[]\ QWERTYUIOP{}| asdfghjkl;' ASDFGHJKL:" zxcvbnm,./ ZXCVBNM<>?\1e\1d
\ No newline at end of file