Source code for procars.utils.utils_IO

#!/usr/bin/env python
# -*- coding: utf-8 -*-:

Copyright © Bonsai - LIFL (Université Lille 1, CNRS UMR 8022) and Inria-Lille Nord Europe


This software is a computer program whose purpose is to progressively reconstruct ancestral
gene orders.

This software is governed by the CeCILL-B license under French law and
abiding by the rules of distribution of free software. You can use,
modify and/or redistribute the software under the terms of the CeCILL-B
license as circulated by CEA, CNRS and Inria at the following URL "http://www.cecill.info", or in the LICENCE file at the root directory of this program.

The fact that you are presently reading this means that you have had
knowledge of the CeCILL-B license and that you accept its terms.


``utils_IO`` **module description**:

This module contains helper functions used by the other modules when they need to read from or write to files.

.. moduleauthor:: Aïda Ouangraoua, Amandine PERRIN

May 2014


    import cPickle as pickle
except ImportError:
    import pickle
from procars.utils import util_adjacency_functions

def save_binary_information(bin_filename, information):
    """
    Serialize ``information`` with pickle and store it in a binary file.

    Parameters
    ----------
    bin_filename : String
        Name of the binary file in which the CARs information is saved.
        The file is opened in ``"wb"`` mode, so an existing file is
        overwritten.
    information : list
        Information we need to save into the binary file

    """
    with open(bin_filename, "wb") as pickle_target:
        pickle.dump(information, pickle_target)
def read_binary_file(bin_filename):
    """
    Load back the information that was pickled into a binary file.

    .. warning:: The content is decoded with ``pickle``: only read files
       coming from a trusted source.

    Parameters
    ----------
    bin_filename : string
        Name of the binary file from which the information is read

    Returns
    -------
    object
        The information stored in the file (the first pickled object)

    """
    with open(bin_filename, "rb") as pickle_source:
        return pickle.load(pickle_source)
def read_adjacency_file(adjacency_file_name, nb_blocks, discarded=False):
    """
    Read an adjacency file and collect the left/right neighbors of each block.

    The result is used in ``compute_pqtree`` and ``resolve_conflict``.
    Each non-empty line of the file must start with two signed block
    numbers (``signed_block1 signed_block2``); any further columns are
    ignored. Blank lines are skipped.

    Parameters
    ----------
    adjacency_file_name : string
        Name of the file containing the adjacencies to parse
    nb_blocks : int
        Total number of blocks (blocks are numbered from 1 to nb_blocks)
    discarded : boolean
        True if we are reading a file of discarded adjacencies (and hence
        there can be multiple left/right neighbors), False for a file of
        added adjacencies (one left/right neighbor per block end)

    Returns
    -------
    tuple
        *left:* dict with:

            - if not discarded: for each block number, its left neighbor
              (the first one found in the file), or 0 if it has none:
              left[bloc2] = bloc1
            - if discarded: for each block number, a list containing its
              potential left neighbors: left[bloc1] = [bloc2, -bloc3,..]

        *right:* dict with:

            - if not discarded: for each block number, its right neighbor
              (the first one found in the file), or 0 if it has none:
              right[bloc1] = bloc2
            - if discarded: for each block number, a list containing its
              potential right neighbors: right[bloc1] = [-bloc2, bloc3,..]

    """
    left = {block_id: [] for block_id in range(1, nb_blocks + 1)}
    right = {block_id: [] for block_id in range(1, nb_blocks + 1)}
    with open(adjacency_file_name, "r") as adjacency_lines:
        for line in adjacency_lines:
            adjacency = line.split()
            if not adjacency:
                # blank line (e.g. trailing newline at the end of the file)
                continue
            signed_block1 = int(adjacency[0])
            signed_block2 = int(adjacency[1])
            # block1 in forward orientation: block2 follows it on its right.
            # Otherwise, block1 is reversed and (reversed) block2 is on its left.
            if signed_block1 > 0:
                right[signed_block1].append(signed_block2)
            else:
                left[-signed_block1].append(-signed_block2)
            # Symmetric update, seen from block2.
            if signed_block2 > 0:
                left[signed_block2].append(signed_block1)
            else:
                right[-signed_block2].append(-signed_block1)
    if not discarded:
        # Added adjacencies: keep a single neighbor per side (0 = no neighbor).
        # dict.items() works with both Python 2 and Python 3.
        left = {key: (value[0] if value else 0)
                for key, value in left.items()}
        right = {key: (value[0] if value else 0)
                 for key, value in right.items()}
    return left, right
def read_car_file(car_file_name, nb_blocks):
    """
    Read a CAR file.

    Lines starting with ``#`` (after stripping) and blank lines are ignored.
    Every other line must describe one CAR as ``_Q block1 block2 ... Q_``,
    where the blocks are signed integers. CARs are numbered from 1, in the
    order in which they appear in the file.

    Parameters
    ----------
    car_file_name : string
        Name of the file containing the current PQtree
    nb_blocks : int
        Total number of blocks

    Returns
    -------
    tuple
        *cars:* list of lists (cars) ordered by increasing car_id;
        ``cars[0]`` is an empty placeholder so that ``cars[car_id]`` is the
        list of signed blocks of CAR number *car_id*

        *block_to_car:* integer list such that block_to_car[block_id] = car_id
        to which block_id belongs (0 if the block is in no CAR)

        *block_position_in_car:* integer list such that
        block_position_in_car[block_id] = position (starting from 0) of the
        block in the car to which it belongs

    """
    cars = [[]]  # list of cars ordered by increasing car_id
    # list : block_id -> car_id to which block belongs
    block_to_car = [0] * (nb_blocks + 1)
    # list : block_id -> position of block in its car
    block_position_in_car = [0] * (nb_blocks + 1)
    with open(car_file_name, "r") as carlines:
        current_car = 0
        for line in carlines:
            stripped_line = line.strip()
            if not stripped_line or stripped_line.startswith("#"):
                # blank line or comment/header line ("#CAR<n>")
                continue
            current_car += 1
            # keep only what is written between the "_Q" and "Q_" markers
            car_string = line.split("_Q")[1].split("Q_")[0].split()
            car_int = [int(cur_block) for cur_block in car_string]  # signed integers
            for num, cur_block in enumerate(car_int):
                block_to_car[abs(cur_block)] = current_car
                block_position_in_car[abs(cur_block)] = num
            cars.append(car_int)
    return cars, block_to_car, block_position_in_car
def read_conflict_adj_file(adj_file, nb_species, leaves, tree, spe_ids):
    """
    Read the file of previously discarded adjacencies and yield each of them.

    Each line of the file is expected to contain, separated by whitespace:
    the two signed blocks of the adjacency, then the car adjacency
    information, and finally one 0/1 label per species (the ``nb_species``
    last columns).

    Parameters
    ----------
    adj_file : string
        File in which discarded adjacencies are written
    nb_species : int
        Total number of species
    leaves : list
        List of IDs of tree leaves (= genomes)
    tree : dict
        A tree structure; ``tree[leaf_id][0]`` must be the species name of
        the leaf
    spe_ids : dict
        Species as keys, and their corresponding ID as value. This ID is
        used as the index of the species among the labels of a line.

    Returns
    -------
    tuple
        tuple yielded for each adjacency = each line of the file:

        *labels:* dict with leaf IDs as keys, and as value a one-element list
        containing an int specifying if the adjacency is present (2) or
        absent (1) in the corresponding species.

        *adj_id:* int, ID of current adjacency (index of its line in the
        file, starting from 0)

        *adjacency:* tuple, current adjacency (num_bloc1, num_bloc2)

        *step_car_adj:* list of strings, all columns following the two
        blocks: car adjacency, type of adjacency and step at which it was
        found (num_car1, num_car2, type, step), followed by the species
        labels


    .. warning:: These Python objects are yielded and not returned

    """
    with open(adj_file, "r") as adjacency_lines:
        for adj_id, line in enumerate(adjacency_lines):
            words = line.split()
            adjacency = (int(words[0]), int(words[1]))  # pair of signed blocks
            # car adjacency, type of adj and step at which this adjacency was
            # added (the species labels are kept at the end of the list):
            step_car_adj = words[2:]
            # list of labels (given adjacency present=1/absent=0) for each
            # species: the nb_species last words of the line
            labels_list = words[-nb_species:]
            # leave_nums = {leaf_num: leaf_name, ...}
            leave_nums = {leaf_num: tree[leaf_num][0] for leaf_num in leaves}
            # leave_ids = {leaf_num: (leaf_id in spe_ids), ...}
            # dict.items() works with both Python 2 and Python 3.
            leave_ids = {leaf_num: spe_ids[spe]
                         for leaf_num, spe in leave_nums.items()}
            # labels = {leaf_num: [label] (1=absent/2=present)}
            labels = {leaf_num: [int(labels_list[leaf_id]) + 1]
                      for leaf_num, leaf_id in leave_ids.items()}
            yield labels, adj_id, adjacency, step_car_adj
def write_retained_conflict_adjs(filename, adj_infos, maximum_set, adj_ids):
    """
    Write the adjacencies retained after a conflict resolution into a txt
    file (which becomes the next adjacency file).

    One line is written per retained adjacency, in the order of
    ``maximum_set``: the two blocks, the three first items of the adjacency
    information, the step number and the labels, separated by single spaces.

    Parameters
    ----------
    filename : string
        Name of the file in which retained adjacencies are stored
    adj_infos : dict
        Dictionary with adjacency IDs as keys, and values are a list with the
        car adjacency (car1, car2), the type of adjacency, the step at which
        it was found and the presence of this adjacency in each species
    maximum_set : list
        List of retained adjacency ids
    adj_ids : dict
        Keys are adjacency IDs, and values are the adjacency corresponding
        to the ID

    """
    with open(filename, "w") as output_file:
        for retained_id in maximum_set:
            infos = adj_infos[retained_id]
            # block adjacency, then car adjacency and type of adjacency
            fields = list(adj_ids[retained_id])
            fields.extend(list(infos[:3]))
            # step number, converted to an integer
            fields.append(int(infos[3]))
            # presence/absence labels, converted to integers
            fields.extend(int(label) for label in infos[4:])
            output_file.write(" ".join(map(str, fields)) + "\n")
def write_adjacency(output_file, cars, car_adjacency, adj_type, step_nb, labels):
    """
    Write all the information of one adjacency, as one line, into a file
    handler.

    The written line is ``block1 block2 car1 car2 adj_type step_nb``
    immediately followed by ``labels`` (no separator is inserted before
    ``labels``) and a newline.

    Parameters
    ----------
    output_file : FileHandler
        Open file or StringIO in which writing the status of adjacencies
    cars : list
        List of current cars
    car_adjacency : list
        The two elements of the adjacency. Each one is converted into a
        signed block with ``util_adjacency_functions.car_to_block``
        (presumably they are signed CAR identifiers -- to be confirmed
        against ``car_to_block``).
    adj_type : int
        Type of adjacency : 0 if fully, 1 if partly
    step_nb : int
        Current step of the ProCars Method
    labels : string
        More information (presence/absence of the adjacency in each species)

    """
    first_car, second_car = car_adjacency[0], car_adjacency[1]
    signed_block1 = util_adjacency_functions.car_to_block(first_car, 1, cars)
    signed_block2 = util_adjacency_functions.car_to_block(second_car, 0, cars)
    columns = (signed_block1, signed_block2, first_car, second_car,
               adj_type, step_nb)
    adjacency_line = " ".join([str(column) for column in columns])
    output_file.write(adjacency_line + labels + "\n")
def write_car_file(car_filename, all_cars):
    """
    Write all CARs into the given file.

    Each CAR takes two lines: a header ``#CAR<n>`` (n starting from 1,
    following the order of ``all_cars``), then its signed blocks written as
    ``_Q bloc1 bloc2 ... Q_``.

    Parameters
    ----------
    car_filename : String
        Name of the file in which all CARs are stored (= PQtree file)
    all_cars : list
        List of lists, such that all_cars[car_num] = [bloc1, bloc2, ...] =
        all ordered signed blocks of car number *car_num*

    """
    with open(car_filename, "w") as carfile:
        for car_number, signed_blocks in enumerate(all_cars, 1):
            header = "#CAR" + str(car_number) + "\n"
            blocks_text = " ".join(map(str, signed_blocks))
            carfile.write(header)
            carfile.write("_Q " + blocks_text + " Q_\n")