#ifndef TRIE_H
#define TRIE_H

#include <stdio.h> /* FILE is used by the prototypes below */

/*
 * One node of the dictionary trie.
 *
 * A node stores one double-byte character as a 16-bit value
 * ((first byte << 8) + second byte).  Alternatives at the same depth are
 * chained through `sibling`; the characters that may follow are reached
 * through `child`.
 */
struct trie_node
{
    int value;                 /* (first byte << 8) + second byte */
    int eof;                   /* non-zero when a word ends at this node */
    struct trie_node *child;   /* first node of the next level, or NULL */
    struct trie_node *sibling; /* next alternative on this level, or NULL */
};

/* Reset the root table to an empty state. */
extern void init_index(void);

/* Read a word list from findex into the index; returns 1 on success, 0 on failure. */
extern int create_index(FILE *findex);

/* Segment the text read from fin and write the tokens, one per line, to fout. */
extern void search_index(FILE *fin, FILE *fout);

/* Release the heap nodes owned by the index. */
extern void free_index(void);

#endif
#include <stdio.h>
#include <stdlib.h>
#include "trie.h"
/*
 * Non-zero when ch is a 7-bit ASCII code (0x00..0x7F).
 * The argument is parenthesised so that any expression may be passed, and
 * the bit test also rejects negative values (e.g. a signed char >= 0x80).
 */
#define ascii_char(ch) ((((ch) & ~0x7f) == 0) ? 1 : 0)
/* Number of root slots: one per possible 16-bit character value. */
#define HASH_SIZE 65536
/* Size in bytes of the match buffer (room for 128 double-byte characters). */
#define BUFFER_SIZE 256
/*The input file is deliberately not read into memory before it is analyzed,*/
/*because the size of the input file cannot be known in advance.*/
/*The operating system (e.g. WIN32) already reads the file into memory block by block,*/
/*so we only need a few fseek calls to step back in the file.*/
/* Root table of the index: the slot for a word is selected directly by the */
/* 16-bit value built from the word's first two bytes. */
static struct trie_node hash[HASH_SIZE];
/*init the hash table*/
extern void init_index()
{
int i;
struct trie_node *tmp;
for(i=0;i<HASH_SIZE;i++)
{
tmp=hash+i;
tmp->eof=0;
tmp->value=0;
tmp->child=NULL;
tmp->sibling=NULL;
}
}
/*
 * Allocate a trie node for one double-byte character.
 *
 * value: the 16-bit character code to store in the node.
 *
 * Returns the new node with eof cleared (no word ends here yet) and no
 * child or sibling, or NULL when the allocation fails.  The caller owns
 * the node and must link it into the trie.
 */
static struct trie_node *new_trie_node(int value)
{
    struct trie_node *node = malloc(sizeof *node);

    if (node == NULL)
        return NULL; /* report out-of-memory instead of dereferencing NULL */

    node->value = value;
    node->eof = 0;
    node->child = NULL;
    node->sibling = NULL;
    return node;
}
/*
 * Build the index from a word list.
 *
 * findex: stream positioned at the start of the list.  Every word is a
 *         sequence of double-byte characters terminated by '\n'; the end
 *         of the file also terminates the last word.
 *
 * Returns 1 when the whole list was inserted.  Returns 0 when a single-byte
 * ASCII character appears where a double-byte character is expected, when
 * the file ends in the middle of a double-byte character, or when a node
 * cannot be allocated.  On failure the words inserted so far stay in the
 * index.
 */
extern int create_index(FILE *findex)
{
    /* fgetc results are kept in int so that EOF differs from the byte 0xFF */
    int first, second;
    int value;
    struct trie_node *head, *node, *last;

    while ((first = fgetc(findex)) != EOF) {
        if (ascii_char(first))
            return 0;
        second = fgetc(findex);
        if (second == EOF)
            return 0; /* file ends inside a double-byte character */
        value = (first << 8) + second;
        head = hash + value; /* the first character selects the root slot */

        /* the remaining characters of the word walk down the trie */
        while ((first = fgetc(findex)) != EOF && first != '\n') {
            if (ascii_char(first))
                return 0;
            second = fgetc(findex);
            if (second == EOF)
                return 0;
            value = (first << 8) + second;

            /* look for the character among the children of head */
            last = NULL;
            for (node = head->child; node != NULL; node = node->sibling) {
                if (node->value == value)
                    break;
                last = node;
            }
            if (node == NULL) {
                /* not present yet: append it to the end of the child list */
                node = new_trie_node(value);
                if (node == NULL)
                    return 0;
                if (last == NULL)
                    head->child = node;
                else
                    last->sibling = node;
            }
            head = node;
        }
        head->eof = 1; /* a complete word ends at this node */
    }
    return 1;
}
/*
 * Free head together with every node reachable through its child pointer.
 *
 * The siblings of head itself are not touched; the caller walks that list
 * and calls free_trie on each entry.  Passing NULL is a no-op.
 */
static void free_trie(struct trie_node *head)
{
    struct trie_node *node, *next;

    if (head == NULL)
        return;

    for (node = head->child; node != NULL; node = next) {
        next = node->sibling; /* read the link before the node is freed */
        free_trie(node);
    }
    /* always release the node itself, including leaves without children */
    free(head);
}
/*
 * Release every heap node hanging off the root table.
 *
 * The root slots themselves are static and are not freed; their child
 * pointer and end-of-word tag are cleared so the table is empty afterwards
 * and the function can safely be called again.
 */
extern void free_index(void)
{
    int i;
    struct trie_node *node, *next;

    for (i = 0; i < HASH_SIZE; i++) {
        for (node = hash[i].child; node != NULL; node = next) {
            next = node->sibling; /* read the link before the node is freed */
            free_trie(node);
        }
        hash[i].child = NULL; /* do not leave a dangling pointer behind */
        hash[i].eof = 0;
    }
}
/*
 * Emit one token and reposition the input stream.
 *
 * fin:               input stream to reposition.
 * fout:              output stream receiving the token.
 * buffer:            bytes collected for the current candidate word.
 * last_match_offset: number of leading bytes of buffer to write.
 * cur_file_offset:   number of bytes read from fin for this candidate.
 *
 * Writes the first last_match_offset bytes of buffer followed by '\n', then
 * moves fin by (last_match_offset - cur_file_offset) relative to its current
 * position, i.e. back over the bytes that were read but not written.
 *
 * NOTE(review): the C standard only guarantees a non-zero SEEK_CUR offset on
 * binary streams; callers that rely on the rewind should confirm the mode in
 * which fin was opened.
 */
static void output_buffer(FILE *fin, FILE *fout, unsigned char *buffer,
                          int last_match_offset, int cur_file_offset)
{
    int i;

    for (i = 0; i < last_match_offset; i++)
        fputc(buffer[i], fout);
    fputc('\n', fout);

    /* step back so the unwritten bytes are read again by the caller */
    fseek(fin, (long)(last_match_offset - cur_file_offset), SEEK_CUR);
}
/*
 * Segment the text in fin by forward longest match against the index and
 * write the tokens to fout, one per line.
 *
 * fin:  input stream; it is repositioned with relative seeks, see
 *       output_buffer.
 * fout: output stream.
 *
 * Starting at a double-byte character, the trie is followed for as long as
 * the input matches; the longest prefix that ends a dictionary word (or the
 * first character alone when no word matches) is written and scanning
 * resumes right after it.  A single-byte ASCII character is written as a
 * token of its own.
 *
 * Bookkeeping for the current candidate:
 *   last_match_offset  bytes of buffer that form the longest match so far
 *   cur_buffer_offset  bytes stored in buffer
 *   cur_file_offset    bytes read from fin since the candidate started
 */
extern void search_index(FILE *fin, FILE *fout)
{
    int last_match_offset, cur_buffer_offset, cur_file_offset;
    /* fgetc results are kept in int so that EOF differs from the byte 0xFF */
    int lead, trail;
    int value;
    unsigned char buffer[BUFFER_SIZE] = {0};
    struct trie_node *head;

    while ((lead = fgetc(fin)) != EOF) {
        last_match_offset = 0;
        cur_buffer_offset = 0;
        cur_file_offset = 0;

        if (ascii_char(lead)) {
            /* kept from the original output format: the empty candidate is */
            /* flushed first, so an empty line precedes the character */
            output_buffer(fin, fout, buffer,
                          last_match_offset, cur_file_offset);
            fputc(lead, fout);
            fputc('\n', fout);
            continue;
        }

        trail = fgetc(fin);
        if (trail == EOF) {
            /* a lone lead byte ends the input: emit it instead of dropping it */
            fputc(lead, fout);
            fputc('\n', fout);
            break;
        }

        buffer[cur_buffer_offset++] = (unsigned char)lead;
        buffer[cur_buffer_offset++] = (unsigned char)trail;
        value = (lead << 8) + trail;
        last_match_offset = 2; /* the first character is always emitted */
        cur_file_offset = 2;
        head = hash + value;

        for (;;) {
            lead = fgetc(fin);
            if (lead == EOF) {
                /* input exhausted: emit the longest match found so far; any */
                /* unmatched tail is rewound and scanned by the outer loop */
                output_buffer(fin, fout, buffer,
                              last_match_offset, cur_file_offset);
                break;
            }

            if (ascii_char(lead)) {
                if (last_match_offset == cur_file_offset) {
                    /* nothing to rescan: emit the word, then the character */
                    output_buffer(fin, fout, buffer,
                                  last_match_offset, cur_file_offset);
                    fputc(lead, fout);
                    fputc('\n', fout);
                } else {
                    /* an unmatched tail precedes the ASCII byte: rewind over */
                    /* the tail and this byte so both are scanned again in order */
                    output_buffer(fin, fout, buffer,
                                  last_match_offset, cur_file_offset + 1);
                }
                break;
            }

            trail = fgetc(fin);
            if (trail == EOF) {
                /* truncated character: only one extra byte was consumed */
                output_buffer(fin, fout, buffer,
                              last_match_offset, cur_file_offset + 1);
                break;
            }
            value = (lead << 8) + trail;
            cur_file_offset += 2;

            /* look for the character among the children of the current node */
            for (head = head->child; head != NULL; head = head->sibling) {
                if (head->value == value)
                    break;
            }
            if (head == NULL) {
                /* no longer word possible: emit the longest match and rescan */
                output_buffer(fin, fout, buffer,
                              last_match_offset, cur_file_offset);
                break;
            }

            if (cur_buffer_offset + 2 > BUFFER_SIZE) {
                /* candidate would overflow the buffer: stop extending it */
                output_buffer(fin, fout, buffer,
                              last_match_offset, cur_file_offset);
                break;
            }
            buffer[cur_buffer_offset++] = (unsigned char)lead;
            buffer[cur_buffer_offset++] = (unsigned char)trail;
            if (head->eof)
                last_match_offset = cur_buffer_offset; /* longer word found */
        }
    }
}
#include <stdio.h>
#include <stdlib.h>
#include "trie.h"
/*
 * Demo driver: builds the index from "vocabulary.txt", segments "in.txt"
 * and writes the result to "out.txt".
 *
 * The vocabulary file holds one word per line:
 *     *********  ('\n')
 *     *********  ('\n')
 *     ...
 *     *********  ('\n')
 * Keep a line terminator after the last word as well.
 *
 * Prints "OK" after each successful stage and waits for a key press before
 * exiting.  Exits with status 1 when the index cannot be built or when the
 * input or output file cannot be opened.
 */
int main(void)
{
    FILE *findex, *fin, *fout;

    findex = fopen("vocabulary.txt", "r");
    /* initialise the index before anything is inserted */
    init_index();
    if ((findex == NULL) || (create_index(findex) == 0)) {
        if (findex != NULL)
            fclose(findex);
        printf("insert index error\n");
        getchar();
        exit(1);
    }
    fclose(findex);
    printf("OK\n");

    fin = fopen("in.txt", "r");
    fout = fopen("out.txt", "w+");
    if ((fin == NULL) || (fout == NULL)) {
        /* search_index must not be handed a NULL stream */
        if (fin != NULL)
            fclose(fin);
        if (fout != NULL)
            fclose(fout);
        free_index();
        printf("search index error\n");
        getchar();
        exit(1);
    }

    search_index(fin, fout);
    printf("OK\n");
    free_index();
    fclose(fin);
    fclose(fout);
    getchar();
    return 0;
}