Merging and sorting sorted sources


This neat little algorithm takes a number of sorted files and merges them together to a sorted file. It uses fixed block sizes (predefined 16).

It basically has a “slot” for each file, you may think a queue of the blocks in this file, and tries to work its way through to the end of all queues. So it takes from the slot with the smallest block, writes it to the output file, and refills the slot with the next queue element from that file.
It is a little more sophisticated, because instead of looking through all slots for the next smallest block, it actually juggles the slots (files) in a sorted manner.
I will definitely reuse this.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
#include <stdio.h>
#include <signal.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
 
#define outputfile "mergedblocks"
#ifndef DEBUG
#define DEBUG 0
#endif
 
#define ENTRYSIZE 16
 
FILE * f;
 
int comp(const char * a, const char * b) {
	int i;
	for (i = 0; i < ENTRYSIZE; i++) {
		if (a[i] != b[i])
			return a[i] - b[i];
	}
	return 0;
}
void switchentries(char * a, char * b) {
	int i; 
	char t;
	for(i=0; i<ENTRYSIZE; i++) {
		t = a[i];
		a[i] = b[i];
		b[i] = t;
	}
}
 
 
int main(int argc, char ** argv)
{
	int i;
	int j;
	int nfiles;
	int v;
 
	char ** filenames;
	FILE ** files;
	FILE * outfile;
	char ** data;
	unsigned long * nentries;
	int * slot2file;
 
	assert(argc > 1);
	filenames = argv + 1;
	nfiles = argc - 1;
 
	files = (FILE **)calloc(nfiles, sizeof(FILE*));
	assert(files != NULL);
	data = (char **) calloc(nfiles, sizeof(char*));
	assert(data != NULL);
	nentries = (unsigned long *) calloc(nfiles, sizeof(unsigned long));
	assert(nentries != NULL);
	slot2file = (int *) calloc(nfiles, sizeof(int));
	assert(slot2file != NULL);
	for (i = 0; i < nfiles; i++) {
		files[i] = fopen(filenames[i], "r");
		assert(files[i] != NULL);
		nentries[i] = 0;
		slot2file[i] = i;
		data[i] = (char*) malloc(sizeof(char) * ENTRYSIZE);
		assert(data[i] != NULL);
		memset((void*)data[i], 0, ENTRYSIZE);
 
		/* load first entry */
		for (j = 0; j < ENTRYSIZE; j++) {
			v = getc(files[i]);
			assert(v != EOF);
			data[i][j] = v;
		}
		nentries[i]++;
		/* find next fitting */
		for(j = i - 1; j >= 0 && comp(data[j], data[j + 1]) > 0; j--) {
			if(DEBUG)
			printf("switching %d <-> %d\n", j, j+1);
			switchentries(data[j+1], data[j]);
			v = slot2file[j+1];
			slot2file[j+1] = slot2file[j];
			slot2file[j] = v;
		}
	}
	outfile = fopen(outputfile, "w");
	assert(outfile != NULL);
 
	while(1) {
		if(DEBUG)
		for (i = 0; i < nfiles; i++) {
			printf("slot %3d: ", i);
			for(j = 0; j < ENTRYSIZE; j++) {
				printf("%02x", (unsigned char)data[i][j]);
			}
			j = slot2file[i];
			if (j < 0) {
				printf(" (ended)");
				j = (j + 1)*-1;
			}
			printf("\t%d->%d: '%s'", i, j, filenames[j]);
			printf("\n");
		}
		if(slot2file[0] < 0) {
			break;
		}
		/* smallest is at 0 */
		/* write out smallest */
		if (slot2file[1] >= 0 && comp(data[0], data[1]) == 0) {
			printf("duplicate found\n");
			printf("file %s, item %lu\n", filenames[slot2file[0]], nentries[slot2file[0]]);
			printf("file %s, item %lu\n", filenames[slot2file[1]], nentries[slot2file[1]]);
			for(j = 0; j < ENTRYSIZE; j++) {
				printf("%02x", (unsigned char)data[0][j]);
			}
			printf("\n");
		} else {
			if(DEBUG)
			printf("smallest writeout\n");
			for (j = 0; j < ENTRYSIZE; j++) {
				putc(data[0][j], outfile);
			}
		}
 
		if(DEBUG)
		printf("refill %s\n", filenames[slot2file[0]]);
		/* refill slot if possible */
		for (j = 0; j < ENTRYSIZE; j++) {
			v = getc(files[slot2file[0]]);
			if (v == EOF) {
				if(DEBUG)
				printf("end of file\n");
				slot2file[0] = -slot2file[0] - 1;
				break;
			}
			data[0][j] = v;
		}
		if (slot2file[0] >= 0) {
			nentries[slot2file[0]]++;
		}else{ /* push to the end */
			if(DEBUG)
			printf("moving to the end\n");
			for(j = 1; j < nfiles && slot2file[j] >= 0; j++) {
				i = j - 1;
				if(DEBUG)
				printf("switching %d <-> %d\n", i, j);
				switchentries(data[i], data[j]);
				v = slot2file[i];
				slot2file[i] = slot2file[j];
				slot2file[j] = v;
			}
 
		}
		/* put in right place (bubblesort for 1 unsorted item at 0) */
		if(DEBUG)
		printf("sorting\n");
		for(j = 1; j < nfiles && slot2file[j] >= 0;j++){
			i = j - 1;
			if(comp(data[i], data[j]) > 0) {
				if(DEBUG)
				printf("switching %d <-> %d\n", i, j);
				switchentries(data[i], data[j]);
				v = slot2file[i];
				slot2file[i] = slot2file[j];
				slot2file[j] = v;
			}else{
				break;
			}
		}
	}
 
 
	for (i = 0; i < nfiles; i++) {
		slot2file[i] = -(slot2file[i] + 1);
		fclose(files[i]);
		if (i>0)
			nentries[0] += nentries[i];
	}
	fclose(outfile);
	printf("mergesorted %d files (%lu entries), wrote to %s.\n", 
		nfiles, nentries[0], outputfile);
	/*free(data);*/
 
	return 0;
}

  1. #1 by panzi on August 18th, 2009

    Hab noch nicht weiter als zur comp Funktion gelesen, aber warum das Rad neu erfinden?

    #include
    #define comp(s1,s2) memcmp(s1,s2,ENTRYSIZE)
    
  2. #2 by admin on August 19th, 2009

    Guter Hinweis, im Spezialfall dass man nach bits sortieren möchte kann man memcmp verwenden. Kommt natürlich darauf an was die Bedeutung der entries ist.
    Preisfrage: Was unterscheidet die Funktion comp() hier und memcmp()? Es gibt nämlich einen, den man nicht gleich sieht.

    Good hint, in the special case of just sorting by bits you can use memcmp. It obviously depends on how you want to compare the entries.
    Trick question: What is the difference between the function comp() here and memcmp()? There is a non-obvious one.

(will not be published)

  1. No trackbacks yet.