GB2312/UTF-8编码互转

最常用的编码转换就是GB2312 -> UTF-8的转换了。GB2312是简体中文Windows的默认编码,在记事本“另存为”时选择的ANSI就是GB2312编码(ANSI在不同语言版本的操作系统中指代不同的编码,仅在简体中文系统中表示GB2312;严格来说它对应的是代码页936,即GBK)。GBK是GB2312的超集,GB18030又是GBK的超集,相比GB2312扩充的内容包括繁体字、日韩语中的汉字、少数民族文字等不常用字符。UTF-8是Linux的默认编码,UTF-8和GB2312两种编码都兼容ASCII编码。UTF-8是变长编码,设计更灵活,有很大的扩充空间,可以把简体字、繁体字、日文、韩文以及未来可能加入Unicode的新文字和符号都定义在内。

现在我们只考虑GB2312和UTF-8都能表达的简体中文以及ASCII的部分的相互转换,这也是最常用的转换。

在Linux上我们可以使用iconv

头文件 EncodingConvertor.h:

#pragma once

#include <string>
#include <iconv.h>

/// Wrapper around an iconv conversion descriptor.
///
/// The descriptor is opened in the constructor and closed in the destructor.
/// Because the object owns that handle, it is non-copyable: a copy would make
/// two objects close the same descriptor.
class EncodingConvertor
{
public:
    /// Opens a conversion from `fromEncoding` to `toEncoding`.
    /// Both are encoding names understood by iconv_open,
    /// e.g. "UTF-8", "GBK", "GB2312", "GB18030".
    EncodingConvertor(const char* toEncoding, const char* fromEncoding);

    /// Closes the conversion descriptor.
    ~EncodingConvertor();

    // Owning a raw iconv_t: forbid copying so the descriptor is closed once.
    EncodingConvertor(const EncodingConvertor&) = delete;
    EncodingConvertor& operator=(const EncodingConvertor&) = delete;

    /// Converts `in_len` bytes starting at `in` and returns the converted bytes.
    std::string Convert(const char* in, size_t in_len);
    /// Converts a NUL-terminated string.
    std::string Convert(const char* in);
    /// Converts the bytes held by `in`.
    std::string Convert(const std::string& in);

private:
    iconv_t _iconv;  ///< Descriptor returned by iconv_open.
};

/// Returns `s` as UTF-8: if the bytes already pass the UTF-8 heuristic
/// (pure ASCII included) they are returned unchanged, otherwise they are
/// converted from GB2312 to UTF-8.
std::string toUTF8(const std::string& s);
/// Returns `s` as GB2312: bytes that pass the UTF-8 heuristic are converted
/// from UTF-8 to GB2312, anything else is returned unchanged.
std::string toGB2312(const std::string& s);
实现文件:

#include "EncodingConvertor.h"

#include <algorithm>
#include <cerrno>
#include <cstdlib>
#include <cstring>
#include <string>

using namespace std;

// Opens the iconv descriptor for the fromEncoding -> toEncoding conversion.
// The value returned by iconv_open is stored as-is; it is not checked for
// failure here.
EncodingConvertor::EncodingConvertor(const char* toEncoding, const char* fromEncoding)
    : _iconv(iconv_open(toEncoding, fromEncoding))
{
}

// Releases the iconv descriptor opened by the constructor.
EncodingConvertor::~EncodingConvertor() {
    // iconv_open reports failure by returning (iconv_t)-1, and the constructor
    // stores that value unchanged; closing such a descriptor is invalid, so
    // skip it.
    if (_iconv != (iconv_t)-1) {
        iconv_close(_iconv);
    }
}


// Converts `inlen` bytes starting at `in` with the descriptor opened in the
// constructor.
//
// Returns the converted bytes. Returns an empty string when the input is
// empty, when the descriptor is invalid, or when iconv reports an invalid or
// incomplete input sequence (no partial result is returned).
std::string EncodingConvertor::Convert(const char* in, size_t inlen) {
    constexpr size_t BLOCK_SIZE = 1024;  // minimum output buffer size

    if (_iconv == (iconv_t)-1 || in == nullptr) {
        return std::string();
    }

    // Put the descriptor back into its initial state so that a previous,
    // possibly failed, conversion cannot influence this one.
    iconv(_iconv, nullptr, nullptr, nullptr, nullptr);

    // std::string owns the output buffer, so every exit path releases it.
    std::string out(std::max(inlen * 2, BLOCK_SIZE), '\0');
    size_t produced = 0;

    // Runs iconv until it succeeds, doubling the output buffer whenever it
    // reports E2BIG (output buffer full). Any other error aborts.
    auto pump = [&](char** src, size_t* srcLeft) -> bool {
        for (;;) {
            char* dst = &out[produced];
            size_t dstLeft = out.size() - produced;
            const size_t rc = iconv(_iconv, src, srcLeft, &dst, &dstLeft);
            produced = out.size() - dstLeft;
            if (rc != static_cast<size_t>(-1)) {
                return true;
            }
            if (errno != E2BIG) {
                return false;  // EILSEQ / EINVAL / EBADF
            }
            out.resize(out.size() * 2);
        }
    };

    // iconv takes a non-const char**, but it does not modify the input bytes.
    char* src = const_cast<char*>(in);
    size_t srcLeft = inlen;

    // First convert the input, then flush any pending shift sequence
    // (a no-op for stateless encodings such as UTF-8 and GB2312).
    if (!pump(&src, &srcLeft) || !pump(nullptr, nullptr)) {
        return std::string();
    }

    out.resize(produced);
    return out;
}

// Converts a NUL-terminated string; the length is taken with strlen.
// A null pointer is treated as empty input and yields an empty string.
std::string EncodingConvertor::Convert(const char* in) {
    if (in == nullptr) {
        return std::string();
    }
    return this->Convert(in, std::strlen(in));
}

std::string EncodingConvertor::Convert(const std::string& in) {
return this->Convert(in.c_str(), in.length());
}

// Heuristic check of whether the NUL-terminated string `s` is UTF-8.
//
// Recognised forms:
//   - one byte   : 0x00-0x7F (ASCII)
//   - two bytes  : lead 0xC0-0xDF followed by one continuation byte
//   - three bytes: lead 0xE0-0xEF followed by two continuation bytes
// A continuation byte is 0x80-0xBF. Four-byte sequences are not recognised,
// so text containing them is reported as not UTF-8.
//
// Returns true when every non-ASCII byte belongs to one of the recognised
// multi-byte sequences (a pure-ASCII or empty string therefore returns true).
static bool isUTF8(const char* s) {
    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(s);
    const size_t length = std::strlen(s);
    size_t asciiBytes = 0;
    size_t goodBytes = 0;

    // Bounds are checked BEFORE the byte is read, so a lead byte at the very
    // end of the string never causes a read past the terminator.
    auto isContinuation = [&](size_t index) {
        return index < length && (bytes[index] & 0xC0) == 0x80;
    };

    for (size_t i = 0; i < length; ++i) {
        const unsigned char lead = bytes[i];
        if (lead < 0x80) {
            // ASCII is counted separately so it does not skew the comparison.
            ++asciiBytes;
        } else if (lead >= 0xC0 && lead <= 0xDF && isContinuation(i + 1)) {
            goodBytes += 2;
            i += 1;
        } else if (lead >= 0xE0 && lead <= 0xEF &&
                   isContinuation(i + 1) && isContinuation(i + 2)) {
            goodBytes += 3;
            i += 2;
        }
        // Any other byte is left uncounted, which makes the final test fail.
    }

    // Every non-ASCII byte must have been consumed by a valid sequence.
    return goodBytes == length - asciiBytes;
}

std::string toUTF8(const std::string& s) {
if(isUTF8(s.c_str())) return s;
return EncodingConvertor("UTF-8", "GB2312").Convert(s);
}

std::string toGB2312(const std::string& s) {
if(isUTF8(s.c_str())) return EncodingConvertor("GB2312", "UTF-8").Convert(s);
else return s;
}

在Windows上可以通过WideCharToMultiByte和MultiByteToWideChar两个函数完成

示例代码:

// Converts a NUL-terminated string in the system ANSI code page (CP_ACP)
// to a wide-character string.
//
// Returns a zero-initialised buffer allocated with calloc; the caller
// releases it with free(). Returns NULL if the allocation fails.
wchar_t * ANSIToUnicode( const char* str ) {
    // A first call with no output buffer returns the required length in
    // wide characters.
    const int textlen = MultiByteToWideChar(CP_ACP, 0, str, -1, NULL, 0);

    // calloc zero-fills, so the result is always NUL-terminated.
    wchar_t* result = static_cast<wchar_t*>(
        calloc(static_cast<size_t>(textlen) + 1, sizeof(wchar_t)));
    if (result == NULL) {
        return NULL;
    }

    MultiByteToWideChar(CP_ACP, 0, str, -1, result, textlen);
    return result;
}

// Converts a NUL-terminated wide-character string to the system ANSI code
// page (CP_ACP).
//
// Returns a zero-initialised buffer allocated with calloc; the caller
// releases it with free(). Returns NULL if the allocation fails.
char * UnicodeToANSI( const wchar_t *str ) {
    // A first call with no output buffer returns the required size in bytes.
    const int textlen = WideCharToMultiByte(CP_ACP, 0, str, -1, NULL, 0, NULL, NULL);

    // calloc zero-fills, so the result is always NUL-terminated.
    char* result = static_cast<char*>(
        calloc(static_cast<size_t>(textlen) + 1, sizeof(char)));
    if (result == NULL) {
        return NULL;
    }

    WideCharToMultiByte(CP_ACP, 0, str, -1, result, textlen, NULL, NULL);
    return result;
}

// Converts a NUL-terminated UTF-8 string (CP_UTF8) to a wide-character
// string.
//
// Returns a zero-initialised buffer allocated with calloc; the caller
// releases it with free(). Returns NULL if the allocation fails.
wchar_t * UTF8ToUnicode( const char* str ) {
    // A first call with no output buffer returns the required length in
    // wide characters.
    const int textlen = MultiByteToWideChar(CP_UTF8, 0, str, -1, NULL, 0);

    // calloc zero-fills, so the result is always NUL-terminated.
    wchar_t* result = static_cast<wchar_t*>(
        calloc(static_cast<size_t>(textlen) + 1, sizeof(wchar_t)));
    if (result == NULL) {
        return NULL;
    }

    MultiByteToWideChar(CP_UTF8, 0, str, -1, result, textlen);
    return result;
}

// Converts a NUL-terminated wide-character string to UTF-8 (CP_UTF8).
//
// Returns a zero-initialised buffer allocated with calloc; the caller
// releases it with free(). Returns NULL if the allocation fails.
char * UnicodeToUTF8( const wchar_t *str ) {
    // A first call with no output buffer returns the required size in bytes.
    const int textlen = WideCharToMultiByte(CP_UTF8, 0, str, -1, NULL, 0, NULL, NULL);

    // calloc zero-fills, so the result is always NUL-terminated.
    char* result = static_cast<char*>(
        calloc(static_cast<size_t>(textlen) + 1, sizeof(char)));
    if (result == NULL) {
        return NULL;
    }

    WideCharToMultiByte(CP_UTF8, 0, str, -1, result, textlen, NULL, NULL);
    return result;
}