Source code for subgroups.utils.file_format_transformations
# -*- coding: utf-8 -*-
# Contributors:
# Antonio López Martínez-Carrasco <antoniolopezmc1995@gmail.com>
"""This file contains the implementation of different functions used to transform the resulting files obtained by the algorithms.
"""
from re import compile
# Pattern describing one line of the original (subgroup set) result file:
#   <subgroup> ; Sequence of instances tp = bitarray('0101') ; Sequence of instances fp = bitarray('1010') ; Quality Measure<anything>
# The named groups capture the subgroup text ('subgroup') and the two bit strings
# ('positive_bitset' and 'negative_bitset'), which may only contain the characters '0' and '1'.
_regex_pattern = "^(?P<subgroup>.+) ; Sequence of instances tp = bitarray\\('(?P<positive_bitset>[01]+)'\\) ; Sequence of instances fp = bitarray\\('(?P<negative_bitset>[01]+)'\\) ; Quality Measure.+$"
# NOTE: 'compile' here is 're.compile' (imported at the top of the file), not the built-in 'compile'.
# The pattern is compiled once at module level and the resulting object is reused for every line.
_regex_object = compile(_regex_pattern)
[docs]
def to_input_format_for_subgroup_list_algorithms(original_file_path : str, transformed_file_path : str) -> tuple[int, int]:
    """Method to transform the format of a file generated by a traditional SD algorithm (that mines a subgroup set) to the input file format of the algorithms that mine subgroup lists.

    Each line of the original file is matched against the module-level regular expression. For every line that matches, a line of the form '<subgroup> ; <positive bitset> ; <negative bitset>' is written to the transformed file. For every line that does not match, a line of the form 'ERROR: subgroup in line <n> was not correctly read.' is written instead (lines are numbered starting from 1).

    :param original_file_path: path of the original file.
    :param transformed_file_path: path of the transformed file. It is opened in write mode, so any previous content is overwritten.
    :return: a 2-tuple of the form: (number of subgroups correctly read, number of subgroups not correctly read).
    """
    num_of_correct = 0
    num_of_errors = 0
    # The 'with' statement guarantees that both files are closed even if an exception is raised while reading or writing.
    with open(original_file_path, "r") as input_file, open(transformed_file_path, "w") as output_file:
        for line_number, line in enumerate(input_file, start=1):
            # Only the trailing newline is removed: the regular expression must match the complete line.
            match_object = _regex_object.fullmatch(line.rstrip("\n"))
            if match_object:
                subgroup = match_object.group("subgroup")
                positive_bitset = match_object.group("positive_bitset")
                negative_bitset = match_object.group("negative_bitset")
                output_file.write(f"{subgroup} ; {positive_bitset} ; {negative_bitset}\n")
                num_of_correct += 1
            else:
                # The error is recorded in the transformed file in the position of the line that could not be read.
                output_file.write(f"ERROR: subgroup in line {line_number} was not correctly read.\n")
                num_of_errors += 1
    return (num_of_correct, num_of_errors)