kconv.rb   [plain text]


require 'nkf'

module Kconv
  #Constant of Encoding
  AUTO = ::NKF::AUTO
  JIS = ::NKF::JIS
  EUC = ::NKF::EUC
  SJIS = ::NKF::SJIS
  BINARY = ::NKF::BINARY
  NOCONV = ::NKF::NOCONV
  ASCII = ::NKF::ASCII
  UTF8 = ::NKF::UTF8
  UTF16 = ::NKF::UTF16
  UTF32 = ::NKF::UTF32
  UNKNOWN = ::NKF::UNKNOWN
  
  #Regexp of Encoding
  Iconv_Shift_JIS = /\A(?:
		  [\x00-\x7f\xa1-\xdf]                                             |
		  \x81[\x40-\x7e\x80-\xac\xb8-\xbf\xc8-\xce\xda-\xe8\xf0-\xf7\xfc] |
		  \x82[\x4f-\x58\x60-\x79\x81-\x9a\x9f-\xf1]                       |
		  \x83[\x40-\x7e\x80-\x96\x9f-\xb6\xbf-\xd6\x40-\x60]              |
		  \x84[\x40-\x60\x70-\x7e\x80-\x91\x9f-\xbe\x9f-\xfc]              |
		  [\x89-\x8f\x90-\x97\x99-\x9f\xe0-\xea][\x40-\x7e]                |
		  [\x89-\x97\x99-\x9f\xe0-\xe9][\x80-\xfc]                         |
		  \x98[\x40-\x72\x9f-\xfc]                                         |
		  \xea[\x80-\xa4]
		 )*\z/nx
  Iconv_EUC_JP = /\A(?:
	       [\x00-\x7f]                                             |
	       \x8e        [\xa1-\xdf]                                 |
	       \x8f        [\xa1-\xdf] [\xa1-\xdf]                     |
	       [\xa1\xb0-\xbce\xd0-\xf3][\xa1-\xfe]                    |
	       \xa2[\xa1-\xae\xba-\xc1\xca-\xd0\xdc-\xea\xf2-\xf9\xfe] |
	       \xa3[\xb0-\xb9\xc1-\xda\xe1-\xfa]                       |
	       \xa4[\xa1-\xf3]                                         |
	       \xa5[\xa1-\xf6]                                         |
	       \xa6[\xa1-\xb8\xc1-\xd8]                                |
	       \xa7[\xa1-\xc1\xd1-\xf1]                                |
	       \xa8[\xa1-\xc0]                                         |
	       \xcf[\xa1-\xd3]                                         |
	       \xf4[\xa1-\xa6]
	      )*\z/nx
  Iconv_UTF8  = /\A(?:\xef\xbb\xbf)?(?:
  [\x00-\x7f]                                                       |
  \xc2[\x80-\x8d\x90-\x9f\xa1\xaa\xac\xae-\xb1\xb4\xb6\xb8\xba\xbf] |
  \xc3[\x80-\xbf]                                                   |
  \xc4[\x80-\x93\x96-\xa2\xa4-\xab\xae-\xbf]                        |
  \xc5[\x80-\x8d\x90-\xbe]                                          |
  \xc7[\x8d-\x9c\xb5]                                               |
  \xcb[\x87\x98-\x9b\x9d]                                           |
  \xce[\x84-\x86\x88-\x8a\x8c\x8e-\xa1\xa3-\xbf]                    |
  \xcf[\x80-\x8e]                                                   |
  \xd0[\x81-\x8c\x8e-\xbf]                                          |
  \xd1[\x80-\x8f\x91-\x9f]                                          |
  \xe2\x84[\x83\x96\xa2\xab]                                        |
  \xe2\x86[\x83\x91-\x93\x96\xa2\xab]                               |
  \xe2\x87[\x83\x91-\x94\x96\xa2\xab]                               |
  \xe2\x88[\x82-\x83\x87-\x88\x8b\x91-\x94\x96\x9a\x9d-\x9e\xa0\xa2\xa7-\xac\xb4-\xb5\xbd]  |
  \xe2\x89[\x82-\x83\x87-\x88\x8b\x91-\x94\x96\x9a\x9d-\x9e\xa0-\xa2\xa6-\xac\xb4-\xb5\xbd] |
  \xe2[\x8a\x8c][\x82-\x83\x86-\x88\x8b\x91-\x94\x96\x9a\x9d-\x9e\xa0-\xa2\xa5-\xac\xb4-\xb5\xbd] |
  \xe2[\x94-\x99][\x81-\x83\x86-\x88\x8b-\x8c\x8f-\x94\x96-\x98\x9a-\x9e\xa0-\xac\xaf-\xb0\xb3-\xb5\xb7-\xb8\xbb-\xbd\xbf] |
  \xe3\x80[\x81-\x83\x85-\x98\x9a-\x9e\xa0-\xad\xaf-\xb0\xb2-\xb5\xb7-\xb8\xbb-\xbd\xbf] |
  \xe3[\x81-\x83\xb8-\xbf][\x81-\xbf]          |
  [\xe5-\xe7][\x80-\xbf][\x81-\xbf]            |
  \xe8[\x80-\xae\xb0-\xbf][\x81-\xbf]          |
  \xe9[\x80-\x92\x95-\xb1\xb3-\xbe][\x81-\xbf] |
  \xef[\xbc-\xbe][\x81-\xbf]                   |
  )*\z/nx
  RegexpShiftjis = /\A(?:
		       [\x00-\x7f\xa1-\xdf] |
		       [\x81-\x9f\xe0-\xfc][\x40-\x7e\x80-\xfc] 
		      )*\z/nx
  RegexpEucjp = /\A(?:
		    [\x00-\x7f]                         |
		    \x8e        [\xa1-\xdf]             |
		    \x8f        [\xa1-\xdf] [\xa1-\xdf] |
		    [\xa1-\xdf] [\xa1-\xdf]
		   )*\z/nx
  RegexpUtf8  = /\A(?:
		    [\x00-\x7f]                                     |
		    [\xc2-\xdf] [\x80-\xbf]                         |
		    \xe0        [\xa0-\xbf] [\x80-\xbf]             |
		    [\xe1-\xef] [\x80-\xbf] [\x80-\xbf]             |
		    \xf0        [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] |
		    [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] |
		    \xf4        [\x80-\x8f] [\x80-\xbf] [\x80-\xbf]
		   )*\z/nx

  #
  # kconv
  #
  
  def kconv(str, out_code, in_code = AUTO)
    opt = '-'
    case in_code
    when ::NKF::JIS
      opt << 'J'
    when ::NKF::EUC
      opt << 'E'
    when ::NKF::SJIS
      opt << 'S'
    when ::NKF::UTF8
    when ::NKF::UTF16
      opt << 'W'
    end

    case out_code
    when ::NKF::JIS
      opt << 'j'
    when ::NKF::EUC
      opt << 'e'
    when ::NKF::SJIS
      opt << 's'
    when ::NKF::UTF8
    when ::NKF::UTF16
      opt << 'w'
    when ::NKF::NOCONV
      return str
    end

    opt = '' if opt == '-'

    ::NKF::nkf(opt, str)
  end
  module_function :kconv

  #
  # Encode to
  #

  def tojis(str)
    ::NKF::nkf('-j', str)
  end
  module_function :tojis

  def toeuc(str)
    ::NKF::nkf('-e', str)
  end
  module_function :toeuc

  def tosjis(str)
    ::NKF::nkf('-s', str)
  end
  module_function :tosjis

  def toutf8(str)
    ::NKF::nkf('-w', str)
  end
  module_function :toutf8

  def toutf16(str)
    ::NKF::nkf('-w16', str)
  end
  module_function :toutf16

  #
  # guess
  #

  def guess(str)
    ::NKF::guess(str)
  end
  module_function :guess

  def guess_old(str)
    ::NKF::guess_old(str)
  end
  module_function :guess_old

  #
  # isEncoding
  #

  def iseuc(str)
    RegexpEucjp.match( str )
  end
  module_function :iseuc

  def issjis(str)
    RegexpShiftjis.match( str )
  end
  module_function :issjis

  def isutf8(str)
    RegexpUtf8.match( str )
  end
  module_function :isutf8

end

class String
  def kconv(out_code, in_code=Kconv::AUTO)
    Kconv::kconv(self, out_code, in_code)
  end
  
  # to Encoding
  def tojis
    ::NKF::nkf('-j', self)
  end
  def toeuc
    ::NKF::nkf('-e', self)
  end
  def tosjis
    ::NKF::nkf('-s', self)
  end
  def toutf8
    ::NKF::nkf('-w', self)
  end
  def toutf16
    ::NKF::nkf('-w16', self)
  end
  
  # is Encoding
  def iseuc
    Kconv.iseuc( self )
  end

  def issjis
    Kconv.issjis( self )
  end

  def isutf8
    Kconv.isutf8( self )
  end
end