Source code for subgroups.utils.file_format_transformations

# -*- coding: utf-8 -*-

# Contributors:
#    Antonio López Martínez-Carrasco <antoniolopezmc1995@gmail.com>

"""This file contains the implementation of different functions used to transform the resulting files obtained by the algorithms.
"""

from re import compile

# Regular expression describing one line of a file generated by a traditional SD algorithm.
# It is written as adjacent raw-string literals (one per field of the line), which are
# concatenated at compile time into a single pattern. Named groups:
#   - subgroup: the text before the first " ; " separator that precedes the tp sequence.
#   - positive_bitset: the 0/1 string inside "Sequence of instances tp = bitarray('...')".
#   - negative_bitset: the 0/1 string inside "Sequence of instances fp = bitarray('...')".
# The rest of the line must start with " ; Quality Measure" followed by at least one more character.
_regex_pattern = (
    r"^(?P<subgroup>.+)"
    r" ; Sequence of instances tp = bitarray\('(?P<positive_bitset>[01]+)'\)"
    r" ; Sequence of instances fp = bitarray\('(?P<negative_bitset>[01]+)'\)"
    r" ; Quality Measure.+$"
)
# The pattern is compiled only once, when the module is imported.
_regex_object = compile(_regex_pattern)


[docs]
def to_input_format_for_subgroup_list_algorithms(original_file_path : str, transformed_file_path : str) -> tuple[int, int]:
    """Method to transform the format of a file generated by a traditional SD algorithm (that mines a subgroup set) to the input file format of the algorithms that mine subgroup lists.
    
    Every line of the original file that matches the module-level regular expression (subgroup, sequence of instances tp, sequence of instances fp and quality measure) is written to the transformed file as '<subgroup> ; <positive bitset> ; <negative bitset>'. Every line that does not match is replaced in the transformed file by an error message containing its line number (starting at 1).
    
    :param original_file_path: path of the original file.
    :param transformed_file_path: path of the transformed file. It is opened in write mode, so it is created or overwritten.
    :return: a 2-tuple of the form: (number of subgroups correctly read, number of subgroups not correctly read).
    """
    num_of_lines = 0
    num_of_errors = 0
    # The 'with' statement guarantees that both files are closed even if an exception is raised while opening the second file, reading or writing.
    with open(original_file_path, "r") as input_file, open(transformed_file_path, "w") as output_file:
        for line_number, line in enumerate(input_file, start=1):
            num_of_lines = line_number
            # Only the trailing newline is removed, since the regular expression must match the complete line.
            match_object = _regex_object.fullmatch(line.rstrip("\n"))
            if match_object:
                output_file.write(" ; ".join(match_object.group("subgroup", "positive_bitset", "negative_bitset")) + "\n")
            else:
                output_file.write("ERROR: subgroup in line " + str(line_number) + " was not correctly read.\n")
                num_of_errors = num_of_errors + 1
    return (num_of_lines - num_of_errors, num_of_errors)