经典字符串处理问题

2023年11月10日 223次阅读来源: 游程编码问题

字符串的搜索，匹配，查找，压缩，编码/解码，是一类非常常见的问题。

(1) 压缩一个字符串当中的空格，例如把“ I  like   csdn    ”压缩成“I like csdn”。注意，单词之间的空格，N个空格要压缩到1个，也就是要减少N-1个空格。行首/行尾的空格全部去掉。
算法复杂度O(n)，扫描一遍，在扫描的过程中记住当前已经有了多少个多余的空格(pos)，然后把非空格的字符都向前移动pos个字符。处理完了整个字符串以后再从行尾往前扫描到第一个非空格字符，在其后面填上'\0'，工作完成。

#include<cstdio>
#include<cstring>
#include<string>
using namespace std;
// Collapses spaces in a NUL-terminated string, in place.
//
// Leading and trailing spaces are removed and every run of spaces between
// two words is reduced to a single space. Only ' ' is treated as a space.
//
// str: in/out buffer; a null pointer or an empty string is left untouched.
//
// Runs in one pass: `src` reads every character, `dst` writes the kept ones.
// `dst` never overtakes `src`, so nothing is overwritten before it is read.
void f(char* str){
	if(str==nullptr)return;
	char* dst = str;
	bool pendingSpace = false;//a space run was seen after at least one word
	for(const char* src = str; *src!='\0'; ++src){
		if(*src==' '){
			//Spaces before the first word are dropped outright; later runs
			//are only remembered, so a trailing run is never written.
			pendingSpace = (dst!=str);
			continue;
		}
		if(pendingSpace){
			*dst++ = ' ';
			pendingSpace = false;
		}
		*dst++ = *src;
	}
	*dst = '\0';
}
// Demo: compresses the spaces of a sample sentence and prints the result
// followed by its new length.
int main(){
	char buf[]=" I  like   csdn    ";
	f(buf);//compress spaces in place
	//strlen returns size_t, so the matching conversion is %zu (not %d).
	printf("%s,%zu\n",buf,strlen(buf));
	return 0;
}

(2) 汉字编码的题目，例如输入一个数字串“00001204567809001”，输出“一万两千零四十五亿六千七百八十万九千零一”。

分析: 可以分步骤进行，先对字符串进行错误处理，然后遍历前面的0，找到真正的数字开始的地方。

计算的过程需要将整个字符串分段，8个字符一段(亿)，其中又要再次分为两段(万)，然后对于4位数进行编码

#include"stdafx.h"
#include <iostream>
#include <cassert>
#include <cstring>
#include <string>
#include <vector>
using namespace std;

// Unit text appended after a section of up to eight digits (see addyi).
const char yi[]  = "亿";
// Unit text appended after the upper digits of a section (those above the low four).
const char wan[] = "万";
// Place-value units inside a four-digit group, indexed 0..2:
// tens, hundreds, thousands.
const char* table[] ={
    "十","百","千",
};
// Chinese numerals for the decimal digits, indexed by digit value 0..9.
const char* digits[] ={
    "零","一","二","三","四","五","六","七","八","九"
};
// Validates a decimal digit string and skips its leading zeros.
//
// psz: NUL-terminated input; may be null.
// Returns a pointer into psz at the first non-zero digit, or nullptr when
// the input is null, empty, contains a character other than '0'..'9', or
// consists only of zeros.
char* preprocess(char* psz){
    if(psz==nullptr||*psz=='\0')return nullptr;
    char* pFirstNonZero = nullptr;
    for(char* p=psz;*p!='\0';++p){
        //Reject anything that is not a digit: later stages index the
        //numeral table with (character - '0').
        if(*p<'0'||*p>'9')return nullptr;
        if(pFirstNonZero==nullptr&&*p!='0')pFirstNonZero=p;
    }
    return pFirstNonZero;//still nullptr when every digit was '0'
}
// Adds up the first `len` characters starting at `psz` and returns the total.
// Note: the raw character codes are summed, not digit values, so a run of
// '0' characters does not add up to 0.
size_t sum(const char*psz, size_t len){
    size_t total=0;
    const char* const pEnd=psz+len;
    while(psz!=pEnd){
        total+=*psz++;
    }
    return total;
}
// Accumulated output text; the conversion routines append to it.
string sRet;
// Appends the "亿" unit text to sRet `len` times.
void addyi(size_t len){
    while(len>0){
        sRet.append(yi);
        --len;
    }
}
// Appends the Chinese reading of a group of up to four digits to sRet.
//
// psz: points at N characters, each expected to be '0'..'9'.
// N:   number of digits in the group, at most 4.
//
// Each non-zero digit is written as its numeral followed by its place unit
// (十/百/千; the ones digit has no unit). A run of zeros produces a single
// "零", and only when a non-zero digit follows it inside the group, so
// trailing zeros ("1200") and an all-zero group add nothing.
void processNdigits(const char*psz,size_t N){
    assert(N<=4);
    bool fZero=false;//a zero run is waiting for a following non-zero digit
    for(size_t s=0;s<N;++s){
        if(psz[s]=='0'){
            fZero=true;
            continue;
        }
        if(fZero){
            sRet+=digits[0];
            fZero=false;
        }
        sRet+=digits[psz[s]-'0'];

        const size_t nFollowing=N-1-s;//digits to the right of this one
        if(nFollowing>0)
            sRet+=table[nFollowing-1];//tens, hundreds or thousands unit
    }
}
// Convenience wrapper: converts exactly four digits starting at psz.
void process4digits(const char*psz){
    const size_t kGroupLen=4;
    processNdigits(psz,kGroupLen);
}
void process(char* psz){//处理算法
    char* s = preprocess(psz);
    if(s==nullptr){
		sRet="空";
        return;
	}

    size_t len = strlen(s);
    size_t nSection = len>>3; //按照"亿"分段
	char* pSection = s;
    for(size_t n=0;n<=nSection;++n){ //处理每个分段
        //首先计算当前段的长度和起始指针。除了第一个段长度可能比8少以外，其他都是8个字节
        size_t nSectionLen=(n==0)?(len%8):8;//当前段的长度
		if(nSectionLen==0)continue;

        //每个分段划分成前4个数字(万位)和后4个数字
		else if(nSectionLen==8){
			if(sum(pSection,4)==0){//如果万位全0，那么需要给万位自身输出一个'零'
				pSection+=4;
				sRet+=table[0];//输出'零'
			}else{//处理万位的4个数字并加上万
				process4digits(pSection);
				sRet+=wan;
			}

			pSection+=4;
			process4digits(pSection);//处理千位到个位的4个数字
		}else{//开始的一段数字，不够8位
			if(nSectionLen>4){
				size_t leadingLen = nSectionLen-4;
				processNdigits(pSection,leadingLen);
				nSectionLen=4;
				pSection+=leadingLen;
				sRet+=wan;
			}
			processNdigits(pSection,nSectionLen);
			pSection+=nSectionLen;
		}
        addyi(nSection-n);
    }
	cout<<sRet<<endl;
}

// Demo: converts a sample digit string that has leading zeros.
int main()
{
    char szNumber[]="00001204567809001";
    process(szNumber);
    return 0;
}

(3) 字符串编码/反编码的问题。游程编码是一类经典的问题。

#include"stdafx.h"
#include<Windows.h>
#include<iostream>
#include<vector>
using namespace std;
/*
 * 游程编码的问题:
 * 游程编码的输入可能是一个二进制串或者字符串，长度未知(因为可能很长)
 * 需要用"[内容-长度]对"的方式对其进行编码。例如0000111222编码成041323
 * 也就是0有4个，1有3个，2有3个，编码成041323
 * 需要注意的问题是:
 * (1)如果只有一个字符/一种字符的情况
 * (2)结束的情况
 * (3)重复个数很多的情况: 因为我们用固定的"一个字节"来表示长度
 * 因此每个"[内容-长度]对"表示最大长度就是255字节。
 * 更多的重复要用多个"[内容-长度]对"来表示
 */
int main()
{
    char arr[]="0000111222";
	struct encode{
		char c;
		BYTE length;
		encode(char _c,BYTE _length):
			c(_c),length(_length)
		{}
	};
	vector<encode> vEncode;
	char* psz=arr;
	char cCurrent = '\0';
	size_t nCount = 1;
	while(*psz!='\0'){
		cCurrent=*psz;
		if(nCount==255){
			vEncode.push_back(encode(cCurrent,(BYTE)nCount));
			nCount=1;
		}
		if(* ++psz ==cCurrent){
			++nCount;
			continue;
		}else{
			vEncode.push_back(encode(cCurrent,(BYTE)nCount));
			nCount=1;
		}
	}
	for( auto it = vEncode.begin(); it != vEncode.end(); ++ it ){
		cout<<it->c<<(int)it->length<<endl;
	}
    return 0;
}

    原文作者：游程编码问题
    原文地址: https://blog.csdn.net/lindan_40/article/details/13066299
    本文转自网络文章，转载此文章仅为分享知识，如有侵权，请联系博主进行删除。