日常工作中,我们可能会遇到处理不同格式编码的文本文件的需求,这个问题如果处理不好,就会有中文乱码等棘手的问题。
以下这篇文章写得很不错:
阮一峰:字符编码笔记:ASCII,Unicode和UTF-8
http://www.ruanyifeng.com/blog/2007/10/ascii_unicode_and_utf-8.html
理解这些编码知识后,我们就可以编写C++代码来完成任务了。下面的函数会根据文件开头的BOM判断编码,把UTF-8、UNICODE Big-endian、UNICODE Little-endian这三种格式编码的文本文件转换成ANSI文本文件,没有BOM的文件视为ANSI,不做任何修改。参数为文本文件路径,转换结果会直接覆盖原文件。只用于Windows平台。
/*
 * Converts the text file at fpath, in place, from UTF-8 or UTF-16
 * (little- or big-endian) to the system ANSI code page (CP_ACP).
 *
 * Copyright (c) 2013 赵子清, All rights reserved.
 *
 */

/* Encodings recognised from the byte-order mark at the start of the file. */
#define CODE_FORMAT_ANSI        1
#define CODE_FORMAT_UTF8        2
#define CODE_FORMAT_UNICODE_LE  3
#define CODE_FORMAT_UNICODE_BE  4

typedef int ErrorCode;
#define ERR_OK                  0
#define ERR_FILE_OPEN_FAILED    1001
#define ERR_FILE_IO_FAILED      1002   /* seek / tell / read / write / close failed */
#define ERR_CONVERT_FAILED      1003   /* a Win32 code-page conversion call failed  */

/* Swaps the two bytes of a 16-bit value. */
#define SWAP16(x) \
    ((((x) & 0x00ff) << 8) | \
     (((x) & 0xff00) >> 8) )

/* Reverses the four bytes of a 32-bit value. */
#define SWAP32(x) \
    ((((x) & 0x000000ff) << 24) | \
     (((x) & 0x0000ff00) <<  8) | \
     (((x) & 0x00ff0000) >>  8) | \
     (((x) & 0xff000000) >> 24) )

/* Frees an array allocated with new[] and resets the pointer to 0. */
#define SAFE_DELETE(x) if((x)!=0) {delete[] (x); (x) = 0; }

/*
 * Rewrites the file at fpath as ANSI text.
 *
 * The encoding is chosen from the leading byte-order mark:
 *   EF BB BF -> UTF-8, FF FE -> UTF-16 LE, FE FF -> UTF-16 BE.
 * A file without one of these marks is treated as ANSI and left untouched.
 * The mark itself is not written to the output.
 *
 * The file is only reopened for writing after the whole conversion has
 * succeeded, so a failed read or conversion leaves the original intact.
 *
 * Parameters:
 *   fpath - path of the file to convert; must not be null.
 *
 * Returns:
 *   ERR_OK               on success, or when nothing had to be converted.
 *   ERR_FILE_OPEN_FAILED when the file cannot be opened for reading/writing.
 *   ERR_FILE_IO_FAILED   when seeking, reading, writing or closing fails.
 *   ERR_CONVERT_FAILED   when MultiByteToWideChar/WideCharToMultiByte fails.
 *
 * When _MSC_VER is not defined the function does nothing and returns ERR_OK.
 */
ErrorCode ConvertFormat(const char* fpath)
{
#ifdef _MSC_VER
    assert(fpath != 0);

    FILE* fp = ::fopen(fpath, "rb");
    if (fp == NULL)
        return ERR_FILE_OPEN_FAILED;

    /* Inspect the mark byte by byte so that files shorter than four bytes
       are handled and no assumption about host byte order is needed. */
    unsigned char bom[4] = { 0, 0, 0, 0 };
    const size_t bomRead = ::fread(bom, 1, sizeof(bom), fp);

    int  fmt    = CODE_FORMAT_ANSI;
    long bomLen = 0L;
    if (bomRead >= 3 && bom[0] == 0xEF && bom[1] == 0xBB && bom[2] == 0xBF)
    {
        fmt    = CODE_FORMAT_UTF8;
        bomLen = 3L;
    }
    else if (bomRead >= 2 && bom[0] == 0xFF && bom[1] == 0xFE)
    {
        fmt    = CODE_FORMAT_UNICODE_LE;
        bomLen = 2L;
    }
    else if (bomRead >= 2 && bom[0] == 0xFE && bom[1] == 0xFF)
    {
        fmt    = CODE_FORMAT_UNICODE_BE;
        bomLen = 2L;
    }

    if (fmt == CODE_FORMAT_ANSI)
    {
        ::fclose(fp);
        return ERR_OK;
    }

    ErrorCode rc   = ERR_OK;
    char*     txt  = 0;   /* raw UTF-8 payload (UTF-8 input only)   */
    wchar_t*  wtxt = 0;   /* text as UTF-16 in host byte order      */
    char*     pstr = 0;   /* converted ANSI bytes                   */
    int       wlen = 0;   /* number of wchar_t units in wtxt        */
    int       alen = 0;   /* number of bytes in pstr                */

    do
    {
        /* Size of the payload that follows the mark. */
        long flen = -1L;
        if (::fseek(fp, 0L, SEEK_END) == 0)
            flen = ::ftell(fp);
        if (flen < bomLen || ::fseek(fp, bomLen, SEEK_SET) != 0)
        {
            rc = ERR_FILE_IO_FAILED;
            break;
        }
        const long payload = flen - bomLen;

        if (fmt == CODE_FORMAT_UTF8)
        {
            /* The Win32 conversion calls reject a zero input length, so an
               empty payload skips conversion and yields an empty file. */
            if (payload > 0L)
            {
                const size_t want = static_cast<size_t>(payload);
                txt = new char[want];
                if (::fread(txt, 1, want, fp) != want)
                {
                    rc = ERR_FILE_IO_FAILED;
                    break;
                }

                /* Explicit lengths (not -1) so that embedded NUL characters
                   do not cut the text short. */
                const int srcLen = static_cast<int>(payload);
                wlen = ::MultiByteToWideChar(CP_UTF8, 0, txt, srcLen, NULL, 0);
                if (wlen <= 0)
                {
                    rc = ERR_CONVERT_FAILED;
                    break;
                }
                wtxt = new wchar_t[wlen];
                if (::MultiByteToWideChar(CP_UTF8, 0, txt, srcLen, wtxt, wlen) != wlen)
                {
                    rc = ERR_CONVERT_FAILED;
                    break;
                }
            }
        }
        else
        {
            /* UTF-16: wchar_t is a 16-bit unit under MSVC, so the payload can
               be read straight into the wide buffer. A trailing odd byte is
               ignored. */
            wlen = static_cast<int>(payload / static_cast<long>(sizeof(wchar_t)));
            if (wlen > 0)
            {
                const size_t want = static_cast<size_t>(wlen);
                wtxt = new wchar_t[want];
                if (::fread(wtxt, sizeof(wchar_t), want, fp) != want)
                {
                    rc = ERR_FILE_IO_FAILED;
                    break;
                }
                if (fmt == CODE_FORMAT_UNICODE_BE)
                {
                    /* Big-endian units must be byte-swapped to host order. */
                    for (int i = 0; i < wlen; ++i)
                        wtxt[i] = static_cast<wchar_t>(SWAP16(wtxt[i]));
                }
            }
        }

        if (wlen > 0)
        {
            alen = ::WideCharToMultiByte(CP_ACP, 0, wtxt, wlen, NULL, 0, NULL, NULL);
            if (alen <= 0)
            {
                rc = ERR_CONVERT_FAILED;
                break;
            }
            pstr = new char[alen];
            if (::WideCharToMultiByte(CP_ACP, 0, wtxt, wlen, pstr, alen, NULL, NULL) != alen)
            {
                rc = ERR_CONVERT_FAILED;
                break;
            }
        }
    } while (0);

    ::fclose(fp);

    /* Truncate and rewrite the file only once the ANSI bytes are ready. */
    if (rc == ERR_OK)
    {
        fp = ::fopen(fpath, "wb");
        if (fp == NULL)
        {
            rc = ERR_FILE_OPEN_FAILED;
        }
        else
        {
            const size_t outLen = static_cast<size_t>(alen);
            if (outLen > 0 && ::fwrite(pstr, 1, outLen, fp) != outLen)
                rc = ERR_FILE_IO_FAILED;
            if (::fclose(fp) != 0 && rc == ERR_OK)
                rc = ERR_FILE_IO_FAILED;
        }
    }

    SAFE_DELETE(txt);
    SAFE_DELETE(wtxt);
    SAFE_DELETE(pstr);

    return rc;
#else
    (void)fpath;   /* no conversion outside MSVC builds; the file is untouched */
    return ERR_OK;
#endif
}
转载于:https://www.cnblogs.com/zzqcn/archive/2013/04/25/3043730.html