将超大文件里的数据按行排序, 并消除重复行
首先将超大文件拆分成小文件,然后对小文件进行去重操作和排序操作
每个小文件排序完成后,将每个小文件的第一行拿来排序,然后将最小的存入外部存储空间,再放入拿掉第一个元素的小文件的第二个元素,以此类推
拆分小文件: 假设数据有1G, 内存为10M, 每条数据大小1KB,那么每个小文件最多能放 10M/1KB = 10240 条数据, 一共有 1G/10M ≈ 103 个小文件(下面的代码里 block_size 取 1024 只是示例值)
import time
def splitfile(file_path, dest_folder, block_size=1024):
    """Split a large text file into numbered chunk files.

    The input is streamed, so at most ``block_size`` lines are held in
    memory at a time. Chunks are written to ``dest_folder`` as
    ``file_1.txt``, ``file_2.txt``, ... in input order; the last chunk may
    be shorter than ``block_size``.

    Args:
        file_path: Path of the text file to split.
        dest_folder: Directory that receives the chunk files. It is created
            if it does not exist.
        block_size: Maximum number of lines per chunk file.

    Returns:
        The number of chunk files written (0 for an empty input file).

    Raises:
        ValueError: If ``block_size`` is smaller than 1.
        OSError: If the input cannot be read or a chunk cannot be written.
    """
    import os
    from itertools import islice

    if block_size < 1:
        raise ValueError("block_size must be a positive integer")

    os.makedirs(dest_folder, exist_ok=True)

    count_file = 0
    with open(file_path, 'r') as source:
        while True:
            # islice pulls at most block_size lines without loading the
            # whole file; an empty chunk means the input is exhausted.
            chunk = list(islice(source, block_size))
            if not chunk:
                break
            count_file += 1
            chunk_path = os.path.join(
                dest_folder, "file_" + str(count_file) + '.txt')
            # 'w' rather than 'a+': re-running the split must not append a
            # second copy of the data to chunk files left by an earlier run.
            with open(chunk_path, 'w') as file_write:
                file_write.writelines(chunk)
            print("file_" + str(count_file) + " generated at: " + time.ctime())
    return count_file
if __name__ == '__main__':
    # Log the start time first, then run the split. The placeholder paths
    # "path1" (input file) and "path2" (chunk folder) are kept as written.
    print("Started at: " + time.ctime())
    splitfile("path1", "path2")
先对每个小文件内部排序;然后选定一个文件作为基准块,依次遍历其余文件,把较小的元素交换到基准块中。遍历一遍后,基准块里就是全局最小的一批数据,保存后换第二块继续
import fileinput
file_count = 103  # default number of chunk files to process


def get_sorted_uni_file(folder=None, num_files=None):
    """Sort every chunk file in place and drop its duplicate lines.

    Chunk files are expected to be named ``file_1.txt`` ... ``file_N.txt``
    inside ``folder``. Each one is read completely, de-duplicated, sorted
    lexicographically and written back with exactly one trailing newline
    per line.

    Args:
        folder: Directory holding the chunk files. When omitted, the
            module-level ``dest_folder`` variable is used.
        num_files: Number of chunk files to process. When omitted, the
            module-level ``file_count`` is used.

    Raises:
        OSError: If a chunk file cannot be read or rewritten.
    """
    import os

    if folder is None:
        folder = dest_folder
    if num_files is None:
        num_files = file_count

    for index in range(1, num_files + 1):
        path = os.path.join(folder, "file_" + str(index) + '.txt')
        with open(path, 'r') as small_file:
            # Strip the newline before de-duplicating so that a final line
            # without '\n' still compares equal to an earlier copy of it.
            unique_lines = {line.rstrip('\n') for line in small_file}
        with open(path, 'w') as small_file:
            small_file.writelines(
                line + '\n' for line in sorted(unique_lines))
def sort_whole(src_folder=None, out_folder=None, num_files=None):
    """Merge the sorted chunk files into one sorted, duplicate-free file.

    Performs a k-way merge: the current smallest line across all chunk
    files is written out, then the next line is pulled from the chunk it
    came from. Only one line per chunk is held in memory at a time.

    Every chunk file (``file_1.txt`` ... ``file_N.txt`` in ``src_folder``)
    must already be sorted; otherwise the output order is not guaranteed.
    The result is written to ``file.txt`` in ``out_folder``, replacing any
    previous content.

    Args:
        src_folder: Directory holding the sorted chunk files. When omitted,
            the module-level ``dest_folder`` variable is used.
        out_folder: Directory that receives ``file.txt``; created if
            missing. When omitted, the module-level ``massive_folder``
            variable is used.
        num_files: Number of chunk files to merge. When omitted, the
            module-level ``file_count`` is used.

    Raises:
        OSError: If a chunk file cannot be read or the output cannot be
            written.
    """
    import heapq
    import os
    from contextlib import ExitStack

    if src_folder is None:
        src_folder = dest_folder
    if out_folder is None:
        out_folder = massive_folder
    if num_files is None:
        num_files = file_count

    os.makedirs(out_folder, exist_ok=True)
    out_path = os.path.join(out_folder, "file" + '.txt')

    # ExitStack closes every chunk file and the output even if the merge
    # fails part-way through.
    with ExitStack() as stack:
        streams = []
        for index in range(1, num_files + 1):
            chunk_path = os.path.join(
                src_folder, "file_" + str(index) + '.txt')
            cmp_file = stack.enter_context(open(chunk_path, 'r'))
            # Compare lines without their newline so a last line lacking
            # '\n' sorts and de-duplicates like any other line.
            streams.append(line.rstrip('\n') for line in cmp_file)

        base_file = stack.enter_context(open(out_path, 'w'))
        previous = None
        for line in heapq.merge(*streams):
            # Equal lines from different chunks arrive adjacent to each
            # other in merged order, so comparing with the previous line
            # is enough to drop cross-file duplicates.
            if line != previous:
                base_file.write(line + '\n')
                previous = line
将log中MAC地址提取出来
使用正则表达式。MAC地址的范围是 0000 0000 0001 – FFFF FFFF FFFD,由6组两位十六进制数组成,各组之间以 ":" 或 "-" 隔开
re.findall(r'(?:[0-9a-fA-F]{2}[:-]){5}[0-9a-fA-F]{2}', log_text)
[] 字符集合 {} 出现次数 (?:...) 不捕获的分组 \ 表示下一个字符是原义字符或者转义字符;如果要校验整个字符串恰好是一个MAC地址,可以在两端加上 ^(字符串开始)和 $(字符串结束)
取出数据库表T里第n行到m行的数据
SELECT * FROM T LIMIT n-1, m-n+1
查询一条记录($id)的下一条记录
SELECT * FROM T WHERE id > $id ORDER BY id ASC LIMIT 1
从表pat_info, pat_drug中,统计出drug_cost的和,pat_code,drug_code,并以drug_code排序
SELECT SUM(pat_drug.drug_cost), pat_drug.pat_code, pat_drug.drug_code
FROM pat_info, pat_drug
WHERE pat_info.pat_code = pat_drug.pat_code
GROUP BY pat_drug.pat_code, pat_drug.drug_code
ORDER BY pat_drug.drug_code
如何判断一个二叉树B是不是二叉树A的子树?
比如:
2
/ \
9 8
/ \ /
2 3 5
/
6
有个子结构是: 根结点为 9, 左孩子为 2, 右孩子为 3
// Forward declaration: HasSubtreeCore is defined further down in the file,
// but C++ requires a declaration before the call below.
bool HasSubtreeCore(TreeNode* pTreeHead1, TreeNode* pTreeHead2);

// Entry point for the "is tree 2 a sub-structure of tree 1" check.
//
// pTreeHead1: root of the tree to search in (may be NULL).
// pTreeHead2: root of the pattern tree to look for (may be NULL).
//
// Returns false when exactly one of the two roots is NULL, true when both
// are NULL, and otherwise the result of HasSubtreeCore on the two roots.
bool HasSubtree(TreeNode* pTreeHead1, TreeNode* pTreeHead2)
{
    // Exactly one empty tree: no match is possible.
    if ((pTreeHead1 == NULL && pTreeHead2 != NULL) ||
        (pTreeHead1 != NULL && pTreeHead2 == NULL))
        return false;

    // Two empty trees are treated as matching.
    if (pTreeHead1 == NULL && pTreeHead2 == NULL)
        return true;

    // Both roots are non-NULL from here on, which HasSubtreeCore relies on.
    return HasSubtreeCore(pTreeHead1, pTreeHead2);
}
<pre name="code" class="cpp">bool HasSubtreeCore(TreeNode* pTreeHead1, TreeNode* pTreeHead2)
{
bool result = false;
if(pTreeHead1->m_nValue == pTreeHead2->m_nValue)
{
result = DoesTree1HaveAllNodesOfTree2(pTreeHead1, pTreeHead2);
}
if(!result && pTreeHead1->m_pLeft != NULL)
result = HasSubtreeCore(pTreeHead1->m_pLeft, pTreeHead2);
if(!result && pTreeHead1->m_pRight != NULL)
result = HasSubtreeCore(pTreeHead1->m_pRight, pTreeHead2);
return result;
}
// Checks whether the pattern rooted at pTreeHead2 is matched node-for-node
// by the tree rooted at pTreeHead1, starting at these two roots.
//
// Tree 1 may contain extra nodes below the places where the pattern ends;
// only the nodes present in tree 2 have to exist in tree 1 with equal values.
bool DoesTree1HaveAllNodesOfTree2(TreeNode* pTreeHead1, TreeNode* pTreeHead2)
{
    // The pattern has run out on this branch: nothing left to match.
    if (!pTreeHead2)
        return true;

    // The pattern still has a node here, so tree 1 must supply one with the
    // same value.
    if (!pTreeHead1 || pTreeHead1->m_nValue != pTreeHead2->m_nValue)
        return false;

    // Left side first; the right side is examined only if the left matched.
    if (!DoesTree1HaveAllNodesOfTree2(pTreeHead1->m_pLeft, pTreeHead2->m_pLeft))
        return false;

    return DoesTree1HaveAllNodesOfTree2(pTreeHead1->m_pRight, pTreeHead2->m_pRight);
}