# Source code for subgroups.utils.file_format_transformations

# -*- coding: utf-8 -*-

# Contributors:
#    Antonio López Martínez-Carrasco <antoniolopezmc1995@gmail.com>

"""This file contains the implementation of different functions used to transform the resulting files obtained by the algorithms.
"""

from re import compile

# Expected shape of every line written by a traditional SD algorithm:
#   <subgroup> ; Sequence of instances tp = bitarray('0101') ; Sequence of instances fp = bitarray('1010') ; Quality Measure ...
# The pattern is split into one raw-string piece per field; the concatenated value is what gets compiled.
_regex_pattern = (
    r"^(?P<subgroup>.+)"
    r" ; Sequence of instances tp = bitarray\('(?P<positive_bitset>[01]+)'\)"
    r" ; Sequence of instances fp = bitarray\('(?P<negative_bitset>[01]+)'\)"
    r" ; Quality Measure.+$"
)
_regex_object = compile(_regex_pattern)

def to_input_format_for_subgroup_list_algorithms(original_file_path : str, transformed_file_path : str) -> tuple[int, int]:
    """Method to transform the format of a file generated by a traditional SD algorithm (that mines a subgroup set) to the input file format of the algorithms that mine subgroup lists.

    Every line of the original file that matches the expected format is written to the transformed file as
    '<subgroup> ; <positive bitset> ; <negative bitset>'. Every line that does not match is replaced in the
    transformed file by an error message containing its (1-based) line number.

    :param original_file_path: path of the original file.
    :param transformed_file_path: path of the transformed file.
    :return: a 2-tuple of the form: (number of subgroups correctly read, number of subgroups not correctly read).
    """
    num_of_subgroups = 0
    num_of_errors = 0
    # The 'with' statement guarantees that both files are closed even if an
    # exception is raised while opening the second file or while reading/writing.
    with open(original_file_path, "r") as input_file, open(transformed_file_path, "w") as output_file:
        for line_number, line in enumerate(input_file, start=1):
            # Only the trailing newline is removed: any other whitespace is part of the line and must match the pattern.
            match_object = _regex_object.fullmatch(line.rstrip("\n"))
            if match_object:
                subgroup = match_object.group("subgroup")
                positive_bitset = match_object.group("positive_bitset")
                negative_bitset = match_object.group("negative_bitset")
                output_file.write(f"{subgroup} ; {positive_bitset} ; {negative_bitset}\n")
                num_of_subgroups += 1
            else:
                output_file.write(f"ERROR: subgroup in line {line_number} was not correctly read.\n")
                num_of_errors += 1
    return (num_of_subgroups, num_of_errors)