Calibre OPDS (and HTML) PHP Server : web-based light alternative to Calibre content server / Calibre2OPDS to serve ebooks (epub, mobi, pdf, ...) http://blog.slucas.fr/en/oss/calibre-opds-php-server
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

212 lines
6.3KB

  1. <?php
  2. /**
  3. * @file
  4. * Transliteration processing functions.
  5. */
  /**
   * Transliterates UTF-8 encoded text to US-ASCII.
   *
   * Based on Mediawiki's UtfNormal::quickIsNFCVerify().
   * Based on Drupal 7 transliteration module.
   *
   * Pure ASCII input is returned unchanged. Otherwise the text is scanned byte
   * by byte: each well-formed UTF-8 sequence is decoded to its code point and
   * passed to _transliteration_replace(); each ill-formed sequence (stray tail
   * byte, truncated or interrupted sequence, 0xFE/0xFF, overlong encoding,
   * UTF-16 surrogate, 5-/6-byte form, or code point above U+10FFFF) is replaced
   * by $unknown.
   *
   * @param $string
   *   UTF-8 encoded text input.
   * @param $unknown
   *   Replacement string for characters that do not have a suitable ASCII
   *   equivalent, and for byte sequences that are not valid UTF-8.
   * @param $source_langcode
   *   Optional ISO 639 language code that denotes the language of the input and
   *   is used to apply language-specific variations. It is forwarded unchanged
   *   to _transliteration_replace().
   * @return
   *   Transliterated text.
   */
  function _transliteration_process($string, $unknown = '?', $source_langcode = NULL) {
    // ASCII is always valid; if there is no byte >= 0x80 we can skip all of the
    // decoding work below.
    if (!preg_match('/[\x80-\xff]/', $string)) {
      return $string;
    }

    // Number of tail bytes announced by each byte value (0 for anything that is
    // not a sequence head: ASCII, tail bytes, 0xFE and 0xFF).
    static $tail_bytes;
    // Smallest code point that legitimately needs 1, 2 or 3 tail bytes; anything
    // below is an overlong encoding.
    static $min_ord = array(1 => 0x80, 2 => 0x800, 3 => 0x10000);

    if (!isset($tail_bytes)) {
      $tail_bytes = array();
      for ($n = 0; $n < 256; $n++) {
        if ($n < 0xc0 || $n >= 0xfe) {
          $tail_bytes[$n] = 0;
        }
        elseif ($n < 0xe0) {
          $tail_bytes[$n] = 1;
        }
        elseif ($n < 0xf0) {
          $tail_bytes[$n] = 2;
        }
        elseif ($n < 0xf8) {
          $tail_bytes[$n] = 3;
        }
        elseif ($n < 0xfc) {
          $tail_bytes[$n] = 4;
        }
        else {
          $tail_bytes[$n] = 5;
        }
      }
    }

    // Chop the text into pure-ASCII and non-ASCII areas; large ASCII parts can
    // be copied in one go. Punctuation and digits stay inside the non-ASCII
    // areas so those are not split more than necessary.
    preg_match_all('/[\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*/', $string, $matches);

    $result = '';
    foreach ($matches[0] as $str) {
      if (ord($str[0]) < 0x80) {
        // ASCII chunk: copy verbatim.
        $result .= $str;
        continue;
      }

      $length = strlen($str);
      for ($i = 0; $i < $length; $i++) {
        $byte = ord($str[$i]);
        $remaining = $tail_bytes[$byte];

        if ($remaining === 0) {
          // Not a head byte: ASCII is copied, a stray tail byte or 0xFE/0xFF is
          // replaced.
          $result .= ($byte < 0x80) ? $str[$i] : $unknown;
          continue;
        }

        // Head byte: drop the length-marker bits, then fold in six payload bits
        // per tail byte.
        $ord = $byte & (0x7f >> ($remaining + 1));
        for ($k = 0; $k < $remaining; $k++) {
          if ($i + 1 >= $length) {
            // Premature end of the chunk: one replacement for the truncated
            // sequence, then move on to the next chunk.
            $result .= $unknown;
            break 2;
          }
          $tail = ord($str[$i + 1]);
          if ($tail < 0x80 || $tail > 0xbf) {
            // Illegal tail byte: abandon the sequence. $i still points at the
            // last consumed byte, so the outer loop's increment reprocesses the
            // offending byte, which may itself be ASCII or a new head.
            $result .= $unknown;
            continue 2;
          }
          $ord = ($ord << 6) | ($tail & 0x3f);
          $i++;
        }

        // Reject structurally complete but ill-formed sequences instead of
        // looking them up: 5-/6-byte forms, overlong encodings (which would
        // otherwise alias plain ASCII), surrogates and values above U+10FFFF.
        if ($remaining > 3
          || $ord < $min_ord[$remaining]
          || $ord > 0x10ffff
          || ($ord >= 0xd800 && $ord <= 0xdfff)) {
          $result .= $unknown;
        }
        else {
          $result .= _transliteration_replace($ord, $unknown, $source_langcode);
        }
      }
    }

    return $result;
  }
  /**
   * Replaces a Unicode character using the transliteration database.
   *
   * The code point is split into a "bank" (all bits above the low eight) and an
   * index within that bank (the low eight bits). Each bank is loaded on first
   * use from ./resources/transliteration-data/xNN.php, which is expected to
   * define $base and optionally $variant, and is cached per language for the
   * rest of the request.
   *
   * @param $ord
   *   An ordinal Unicode character code.
   * @param $unknown
   *   Replacement string for characters that do not have a suitable ASCII
   *   equivalent.
   * @param $langcode
   *   Optional ISO 639 language code that denotes the language of the input and
   *   is used to apply language-specific variations. No default language is
   *   resolved here: when omitted, only the base mapping is used.
   * @return
   *   ASCII replacement character, or $unknown when the database has no entry.
   */
  function _transliteration_replace($ord, $unknown = '?', $langcode = NULL) {
    static $map = array();

    // Use an explicit string as cache/variant key instead of relying on PHP's
    // implicit NULL-to-'' array offset conversion.
    $lang_key = isset($langcode) ? (string) $langcode : '';

    $bank = $ord >> 8;
    if (!isset($map[$bank][$lang_key])) {
      // Pre-initialise what the data file is expected to define, so a missing
      // file or a file lacking one of the variables yields an empty mapping
      // rather than undefined-variable notices.
      $base = array();
      $variant = array();

      $file = './resources/transliteration-data/' . sprintf('x%02x', $bank) . '.php';
      if (file_exists($file)) {
        include $file;
      }
      if (!is_array($base)) {
        $base = array();
      }

      if ($lang_key !== 'en' && is_array($variant) && isset($variant[$lang_key])) {
        // Merge in language specific mappings; they win over the base table.
        $map[$bank][$lang_key] = $variant[$lang_key] + $base;
      }
      else {
        $map[$bank][$lang_key] = $base;
      }
    }

    $index = $ord & 255;
    return isset($map[$bank][$lang_key][$index]) ? $map[$bank][$lang_key][$index] : $unknown;
  }