Code Snippets for Implementation in the C Language

Creating temporary files and filenames for them:

C




temp = (FILE**)malloc(sizeof(FILE*) * runs);
filename = (char**)malloc(sizeof(char*) * runs);
for (i = 0; i < runs; i++) {
    filename[i] = (char*)malloc(sizeof(char) * 3);
}
i = 0;
while (i < runs - 1) {
    temp[i] = (FILE*)malloc(sizeof(FILE));
    temp[i] = openFile(filename1[i], "w+");
    writefile(mainFile, temp[i], main_mem);
    fclose(temp[i]);
    i++;
}
temp[i] = (FILE*)malloc(sizeof(FILE));
temp[i] = openFile(filename1[i], "w+");
writefile(mainFile, temp[i], lastrun);
fclose(temp[i]);


Writing data to the temporary file from the main file character by character:

C




void writefile(FILE* mainFile, FILE* tempFile, int numLines)
{
    char c;
    int i;
    for (i = 0; i < nline; i++) {
        fscanf(mainFile, "%c", &c);
        while (c != '\n') {
            fprintf(tempFile, "%c", c);
            fscanf(mainFile, "%c", &c);
        }
        fprintf(tempFile, "%c", c);
    }
}


Creating output temporary files and sorting the individual temporary files:

C




for (i = 0; i < runs - 1; i++) {
    out[i] = (FILE*)malloc(sizeof(FILE));
    out[i] = openFile(filename2[i], "w+");
    temp[i] = openFile(filename1[i], "r+");
    readLine(temp[i]);
    columnSeparate(main_mem, op);
    mergeSort(op, 0, main_mem - 1);
    free(l);
    free(b);
    free(d);
    fclose(temp[i]);
    fclose(out[i]);
}


ReadLine function:

C




void readLine(FILE* fp)
{
    int j = 0, i, f, g, n;
    char c;
    n = main_mem;
    l = (line*)malloc(sizeof(line) * n);
    b = (char**)malloc(sizeof(char*) * (n));
    for (i = 0; i < n; i++) {
        b[i] = (char*)malloc(sizeof(char) * 256);
    }
    d = (char**)malloc(sizeof(char*) * (n));
    for (i = 0; i < n; i++) {
        d[i] = (char*)malloc(sizeof(char) * 256);
    }
    for (i = 0; i < n; i++) {
        l[i].temp = (char*)malloc(sizeof(char) * 256);
        g = fscanf(fp, "%c", &l[i].temp[j]);
        while (l[i].temp[j] != '\n') {
            j++;
            f = fscanf(fp, "%c", &(l[i].temp[j]));
        }
        l[i].temp[j] = '\0';
        j = 0;
    }
}


Column Separate function:

C




void columnSeparate(int lines, opt op)
{
    char tem[256], str[256];
    int i = 0, j 0, count, k = 0, len;
    int col;
    if (op.keyno == 1) {
        for (i = 0; i < lines; i++) {
            len = strlen(l[i].temp);
            while (j < len) {
                tem[k] = l[i].temp[j];
                k++;
                j++;
                if (l[i].temp[j] == '\0') {
                    break;
                }
            }
            tem[k] = '\0';
            j = 0, k = 0;
            l[i].col = (char*)malloc(sizeof(char) * 256);
            strcpy(l[i].col, tem);
            strcpy(tem, "");
        }
    }
    else {
        for (i = 0; i < lines; i++) {
            len = strlen(l[i].temp);
            while (j < (len - 1)) {
                if (l[i].temp[j] == ' '
                    && l[i].temp[j + 1] != ' '
                    && count == 0) {
                    col++;
                    if (col == op.keyno) {
                        count = 1;
                        break;
                    }
                }
                else
                    j++;
            }
            j++;
            while (l[i].temp[j] != '\0'
                   && l[i].temp[j] != ' ' && count == 1) {
                tem[k] = l[i].temp[j];
                k++;
                j++;
            }
            if (count == 1) {
                tem[k] = '\0';
                j = k = col = count = 0;
                l[i].col
                    = (char*)malloc(sizeof(char) * 256);
                strcpy(l[i].col, str);
                strcpy(str, "");
            }
        }
    }
}


Merging the sorted files using Heap functions and storing them in one output file:

C




void mergeSortedFiles(int n, int k, opt op)
{
    FILE **out, *fp;
    char **filename, str[256];
    out = (FILE**)malloc(sizeof(FILE*) * k);
    filename = (char**)malloc(sizeof(char*) * k);
    data d;
    int no, temp[k], i, j = 0, count = 0;
    heapinit(&h);
    fp = openFile("sorted.txt", "w+");
    for (i = 0; i < k; i++) {
        filename[i] = (char*)malloc(sizeof(char) * 5);
        sprintf(filename[i], "%d", i + 500);
    }
    for (i = 0; i < k; i++) {
        out[i] = (FILE*)malloc(sizeof(FILE));
        out[i] = openFile(filename[i], "r+");
    }
    for (i = 0; i < k; i++) {
        l = (line*)malloc(sizeof(line));
        l[0].temp = (char*)malloc(sizeof(char) * 256);
        j = 0;
        fscanf(out[i], "%c", &l[0].temp[j]);
        while (l[0].temp[j] != '\n') {
            j++;
            fscanf(out[i], "%c", &l[0].temp[j]);
        }
        l[0].temp[j] = '\0';
        columnseperate(1, op);
        heapinsert(&h, l[0].col, i, op, l[0].temp);
        free(l);
    }
    while (count < k) {
        if (!heapisempty(&h)) {
            d = heapremove(&h);
            strcpy(str, d.a);
            fprintf(fp, "%s", str);
            fprintf(fp, "%c", '\n');
        }
        else
            return;
        if (!(feof(out[d.fileno]))) {
            l = (line*)malloc(sizeof(line));
            l[0].temp = (char*)malloc(sizeof(char) * 256);
            j = 0;
            fscanf(out[d.fileno], "%c", &l[0].temp[j]);
            if (!feof(out[d.fileno])) {
                while (l[0].temp[j] != '\n') {
                    j++;
                    fscanf(out[d.fileno], "%c",
                           &l[0].temp[j]);
                }
                l[0].temp[j] = '\0';
                columnseperate(1, op);
                heapinsert(&h, l[0].col, d.fileno, op,
                           l[0].temp);
                free(l);
            }
            else {
                count++;
                fclose(out[d.fileno]);
            }
        }
    }
}


Combining these atomic functions, creating the required heap and line structures, writing appropriate functions implementing heap methods, and writing the main method with all the options and calling functions completes the implementation of the Linux sort command. 

Output:

Sorting with -n option. Left Side – Input file | Right Side – Output file

Sorting with the -k1 option. Left Side – Input file | Right Side – Output file

Running the code for both examples

Above are the snapshots of running the executable file of the implementation for two different options and files.  



Internal Implementation of Linux Sort Command

The SORT command in the Linux operating system is used for sorting. It can be done either for files or for input on a command line given by the user. The sort command by default sorts files under the assumption that their data is ASCII. It sorts the files line by line.

The SORT command follows a variety of features for the output. The first is that the alphabetical lines will follow the lines with numbers. Lines containing lowercase letters will appear before lines containing the same character in uppercase. The SORT command has several options for sorting in various ways. Some of them include:

  • -k: This option sorts the file based on the key number, which is stated in the option after k. For example, the “-k2” option would sort the file using the second column of the file.
  • -n: This option sorts the file numerically since the default behavior is ASCII based.
  • -b: This option ignores the leading blanks in the file while sorting.
  • -d: This option sorts according to the dictionary order.
  • -r: This option does a reverse sort or reverses the sorted result.
  • -m: This option merges the already sorted files given as input.
  • -u: This option removes duplicates in the file and then sorts it.
     

The SORT command can be used to sort large files that cannot fit in the main memory and thus reside in the external memory using the mechanism of external sorting, which is a class of sorting algorithms that can handle enormous volumes of data. The algorithm can be more specifically termed External Merge Sorting, and it essentially operates by first merging the sorted chunks together after it sorts the chunks into groups that can all fit in RAM. 

Simply put, first divide the file into runs that fit in the main memory, then sort each run in the main memory, and finally merge the sorted runs into the external memory.

External Sorting

Similar Reads

Steps for Internal Implementation of Linux Sort Command

The implementation of the SORT command differs a little on the basis of the option mentioned, but the primary steps remain the same. The implementation uses the external merge sorting algorithm as mentioned above. The steps for the implementation are as follows:...

Code Snippets for Implementation in the C Language

Creating temporary files and filenames for them:...