几种文本相似度算法的C++实现

1、最小编辑距离

// Weighted edit-distance similarity between two byte strings.
//
// Costs: insertion = 1, deletion = 1, substitution = 2 (0 when the characters
// match), i.e. a substitution is priced like a deletion plus an insertion.
namespace levenshtein
{
// Returns true when the two characters are considered equal (exact match).
bool compare_char_(char c1, char c2)
{
	return c1 == c2;
}

// Cost of inserting a character; currently 1 regardless of the character.
size_t ins_(char /*c*/)
{
	return 1;
}

// Cost of deleting a character; currently 1 regardless of the character.
size_t del_(char /*c*/)
{
	return 1;
}

// Cost of replacing c1 with c2: 0 when they compare equal, otherwise 2.
size_t sub_(char c1, char c2)
{
	return compare_char_(c1, c2) ? 0 : 2;
}

// Computes the weighted edit distance that turns ref_s into ref_l.
//
// Only two rows of the dynamic-programming table are alive at any time, so the
// working memory is 2 * (ref_l.length() + 1) cells instead of a full matrix.
// The buffer is a single raw allocation that is released before returning;
// nothing between the allocation and the release can throw.
size_t compare_(const std::string& ref_s, const std::string& ref_l)
{
	const size_t len_s = ref_s.length();
	const size_t len_l = ref_l.length();
	const size_t cols = len_l + 1;

	size_t* const buffer = new size_t[2 * cols];
	size_t* prev = buffer;			// row i - 1
	size_t* curr = buffer + cols;	// row i

	// Row 0: build each prefix of ref_l from the empty string by insertions.
	prev[0] = 0;
	for(size_t j = 1; j < cols; ++j)
	{
		prev[j] = prev[j - 1] + ins_(ref_l[j - 1]);
	}

	for(size_t i = 1; i <= len_s; ++i)
	{
		// Column 0: reduce the first i characters of ref_s to the empty string.
		curr[0] = prev[0] + del_(ref_s[i - 1]);

		for(size_t j = 1; j < cols; ++j)
		{
			const size_t ins = curr[j - 1] + ins_(ref_l[j - 1]);
			const size_t del = prev[j] + del_(ref_s[i - 1]);
			const size_t sub = prev[j - 1] + sub_(ref_s[i - 1], ref_l[j - 1]);

			curr[j] = std::min(std::min(ins, del), sub);
		}

		// The row just filled becomes the "previous" row of the next iteration.
		size_t* const tmp = prev;
		prev = curr;
		curr = tmp;
	}

	// After the final swap (or with no iterations at all) the answer is in prev.
	const size_t result = prev[len_l];
	delete[] buffer;
	return result;
}

// Returns a similarity score in [0, 1] for the two strings.
//
// Two empty strings score 1. Otherwise the weighted distance is divided by the
// length of the longer string and subtracted from 1; a distance that reaches
// or exceeds that length scores 0.
float compare(const std::string& ref1, const std::string& ref2)
{
	if(ref1.empty() && ref2.empty())
	{
		return 1.0f;
	}

	// The shorter string is always passed as the first argument of compare_.
	const bool first_is_shorter = ref1.length() < ref2.length();
	const std::string& shorter = first_is_shorter ? ref1 : ref2;
	const std::string& longer = first_is_shorter ? ref2 : ref1;

	const size_t distance = compare_(shorter, longer);
	const size_t len = longer.length();

	return distance < len ? 1.0f - static_cast<float>(distance) / len : 0.0f;
}
}	//levenshtein

2、余弦相似度（基于余弦定理）

// Cosine similarity between the token-frequency vectors of two strings.
namespace cosine
{
// Per-token occurrence counts: first = count in ref1, second = count in ref2.
using counts_ = std::pair<size_t, size_t>;
using frequency_map_ = std::map<std::string, counts_>;

// Decides whether substr forms a complete token. This stub accepts every
// candidate, so each single char of the input becomes its own token.
bool word_segment_(const std::string& /*substr*/)
{
	return true;
}

// Splits text into tokens using word_segment_ and increments, for every token,
// the counter selected by slot (&counts_::first or &counts_::second) in table.
void count_tokens_(const std::string& text, size_t counts_::* slot, frequency_map_& table)
{
	for(size_t i = 0, start = 0; i < text.length(); ++i)
	{
		const std::string token = text.substr(start, i - start + 1);
		if(word_segment_(token))
		{
			++(table[token].*slot);
			start = i + 1;
		}
	}
}

// Returns the cosine of the angle between the token-count vectors of ref1 and
// ref2, as a value in [0, 1].
//
// If neither input yields any token the result is 1; if exactly one of them
// yields no token the result is 0. This avoids the 0/0 division that would
// otherwise produce NaN.
float compare(const std::string& ref1, const std::string& ref2)
{
	frequency_map_ container;
	count_tokens_(ref1, &counts_::first, container);
	count_tokens_(ref2, &counts_::second, container);

	// Accumulate in double so the sums neither wrap nor lose precision for
	// realistic input sizes.
	double product = 0.0;
	double modulo1 = 0.0;
	double modulo2 = 0.0;

	for(const auto& entry : container)
	{
		const double first = static_cast<double>(entry.second.first);
		const double second = static_cast<double>(entry.second.second);
		product += first * second;
		modulo1 += first * first;
		modulo2 += second * second;
	}

	if(modulo1 == 0.0 || modulo2 == 0.0)
	{
		// A zero norm means that side produced no tokens.
		return (modulo1 == 0.0 && modulo2 == 0.0) ? 1.0f : 0.0f;
	}

	const double cosine = product / std::sqrt(modulo1 * modulo2);

	// Guard against rounding pushing the ratio marginally above 1.
	return static_cast<float>(cosine > 1.0 ? 1.0 : cosine);
}
}	//cosine

点赞