iso2022.js 3.97 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141
var util = require('util'),
  Match = require ('../match');


/**
 * Base recognizer for the ISO 2022 family of encodings.
 *
 * The individual detectors (JP, CN and KR) inherit from this constructor and
 * put an `escapeSequences` table on each instance; the matching logic itself
 * lives on the prototype.
 */
function ISO_2022() {}

/**
 * Tells whether the escape sequence `seq` occurs in `text` starting at `pos`.
 *
 * The caller has already checked that `text[pos]` is ESC (0x1b), so the
 * comparison starts at the second byte of the sequence.
 *
 * @param text the byte buffer being scanned
 * @param textLen the number of bytes of `text` that may be read
 * @param pos index of the ESC byte in `text`
 * @param seq candidate escape sequence (array of byte values)
 * @return true when every remaining byte of `seq` matches the input
 */
function escapeSequenceAt(text, textLen, pos, seq) {
  // Not enough input left to hold the whole sequence.
  if ((textLen - pos) < seq.length) {
    return false;
  }

  for (var k = 1; k < seq.length; k++) {
    if (seq[k] !== text[pos + k]) {
      return false;
    }
  }

  return true;
}

/**
 * Matching function shared among the 2022 detectors JP, CN and KR.
 *
 * Counts the legal and the unrecognized escape sequences in the sample of
 * text and computes a score based on the total number seen and on the
 * proportion that fit the encoding (`this.escapeSequences`).
 *
 * @param det detector state; `det.fInputBytes` is the byte buffer containing
 *   the text to analyse and `det.fInputLen` is the number of bytes to scan.
 * @return a Match whose quality is a whole number in the range 1-100, or
 *   null when no known escape sequence was found or the quality drops to
 *   zero or below.
 */
ISO_2022.prototype.match = function(det) {
  // TODO: refactor me
  var text = det.fInputBytes;
  var textLen = det.fInputLen;
  var sequences = this.escapeSequences;

  var hits   = 0;   // escape sequences found in the table
  var misses = 0;   // ESC bytes that start no known sequence
  var shifts = 0;   // shift-in / shift-out bytes
  var i, escN, matched, quality;

  for (i = 0; i < textLen; i++) {
    if (text[i] === 0x1b) {
      matched = null;
      for (escN = 0; escN < sequences.length; escN++) {
        if (escapeSequenceAt(text, textLen, i, sequences[escN])) {
          matched = sequences[escN];
          break;
        }
      }

      if (matched !== null) {
        hits++;
        // Skip the rest of the sequence; the loop increment steps past
        // its last byte.
        i += matched.length - 1;
      } else {
        misses++;
      }
      continue;
    }

    // Shift in/out
    if (text[i] === 0x0e || text[i] === 0x0f) {
      shifts++;
    }
  }

  if (hits === 0) {
    return null;
  }

  //
  // Initial quality is based on the relative proportion of recognized vs.
  //   unrecognized escape sequences.
  //   All good:  quality = 100;
  //   half or less good: quality = 0;
  //   linear in between.
  // The ratio is floored so the reported quality is always a whole number
  //   instead of a value such as 33.333...
  quality = Math.floor((100 * hits - 100 * misses) / (hits + misses));

  // Back off quality if there were too few escape sequences seen.
  //   Include shifts in this computation, so that KR does not get penalized
  //   for having only a single escape sequence, but many shifts.
  if (hits + shifts < 5) {
    quality -= (5 - (hits + shifts)) * 10;
  }

  return quality <= 0 ? null : new Match(det, this, quality);
};

/**
 * Recognizer for ISO-2022-JP.
 *
 * Every instance gets its own `name` accessor and its own table of the
 * escape sequences that count as hits for this encoding; the scanning
 * logic comes from ISO_2022 through util.inherits.
 */
module.exports.ISO_2022_JP = function() {
  // Each entry is one escape sequence, starting with ESC (0x1b).
  var jpSequences = [
    [ 0x1b, 0x24, 0x28, 0x43 ],   // KS X 1001:1992
    [ 0x1b, 0x24, 0x28, 0x44 ],   // JIS X 212-1990
    [ 0x1b, 0x24, 0x40 ],         // JIS C 6226-1978
    [ 0x1b, 0x24, 0x41 ],         // GB 2312-80
    [ 0x1b, 0x24, 0x42 ],         // JIS X 208-1983
    [ 0x1b, 0x26, 0x40 ],         // JIS X 208 1990, 1997
    [ 0x1b, 0x28, 0x42 ],         // ASCII
    [ 0x1b, 0x28, 0x48 ],         // JIS-Roman
    [ 0x1b, 0x28, 0x49 ],         // Half-width katakana
    [ 0x1b, 0x28, 0x4a ],         // JIS-Roman
    [ 0x1b, 0x2e, 0x41 ],         // ISO 8859-1
    [ 0x1b, 0x2e, 0x46 ]          // ISO 8859-7
  ];

  // Reports the charset name for this recognizer.
  this.name = function() {
    return 'ISO-2022-JP';
  };

  this.escapeSequences = jpSequences;
};
util.inherits(module.exports.ISO_2022_JP, ISO_2022);



/**
 * Recognizer for ISO-2022-KR.
 *
 * Every instance gets its own `name` accessor and a table holding the one
 * escape sequence this detector looks for; the scanning logic comes from
 * ISO_2022 through util.inherits.
 */
module.exports.ISO_2022_KR = function() {
  // The only escape sequence counted as a hit; it starts with ESC (0x1b).
  var krSequences = [
    [ 0x1b, 0x24, 0x29, 0x43 ]
  ];

  // Reports the charset name for this recognizer.
  this.name = function() {
    return 'ISO-2022-KR';
  };

  this.escapeSequences = krSequences;
};
util.inherits(module.exports.ISO_2022_KR, ISO_2022);



/**
 * Recognizer for ISO-2022-CN.
 *
 * Every instance gets its own `name` accessor and its own table of the
 * escape sequences that count as hits for this encoding; the scanning
 * logic comes from ISO_2022 through util.inherits.
 */
module.exports.ISO_2022_CN = function() {
  // Each entry is one escape sequence, starting with ESC (0x1b).
  var cnSequences = [
    [ 0x1b, 0x24, 0x29, 0x41 ],   // GB 2312-80
    [ 0x1b, 0x24, 0x29, 0x47 ],   // CNS 11643-1992 Plane 1
    [ 0x1b, 0x24, 0x2a, 0x48 ],   // CNS 11643-1992 Plane 2
    [ 0x1b, 0x24, 0x29, 0x45 ],   // ISO-IR-165
    [ 0x1b, 0x24, 0x2b, 0x49 ],   // CNS 11643-1992 Plane 3
    [ 0x1b, 0x24, 0x2b, 0x4a ],   // CNS 11643-1992 Plane 4
    [ 0x1b, 0x24, 0x2b, 0x4b ],   // CNS 11643-1992 Plane 5
    [ 0x1b, 0x24, 0x2b, 0x4c ],   // CNS 11643-1992 Plane 6
    [ 0x1b, 0x24, 0x2b, 0x4d ],   // CNS 11643-1992 Plane 7
    [ 0x1b, 0x4e ],               // SS2
    [ 0x1b, 0x4f ]                // SS3
  ];

  // Reports the charset name for this recognizer.
  this.name = function() {
    return 'ISO-2022-CN';
  };

  this.escapeSequences = cnSequences;
};
util.inherits(module.exports.ISO_2022_CN, ISO_2022);