-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathhic2array.py
156 lines (143 loc) · 5.84 KB
/
hic2array.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import numpy as np
from scipy.sparse import coo_matrix
import hicstraw
import os
import pickle
def write_pkl(data, path):
with open(path, 'wb') as f:
pickle.dump(data, f)
def read_chrom_array(chr1, chr2, normalization, hic_file, resolution):
chr1_name = chr1.name
chr2_name = chr2.name
infos = []
infos.append('observed')
infos.append(normalization)
infos.append(hic_file)
infos.append(chr1_name)
infos.append(chr2_name)
infos.append('BP')
infos.append(resolution)
print(infos)
row, col, val = [], [], []
rets = hicstraw.straw(*infos)
print('\tlen(rets): {:3e}'.format(len(rets)))
for ret in rets:
row.append((int)(ret.binX // resolution))
col.append((int)(ret.binY // resolution))
val.append(ret.counts)
print('\tsum(val): {:3e}'.format(sum(val)))
if sum(val) == 0:
return None
if chr1_name==chr2_name:
max_shape =max(max(row),max(col))+1
mat_coo = coo_matrix((val, (row, col)), shape = (max_shape,max_shape),dtype=np.float32)
else:
max_row = max(row)+1
max_column = max(col)+1
mat_coo = coo_matrix((val, (row, col)), shape = (max_row,max_column),dtype=np.float32)
mat_coo = mat_coo #+ triu(mat_coo, 1).T #no below diagonaline records
return mat_coo
def hic2array(input_hic,output_pkl=None,
resolution=25000,normalization="NONE",
tondarray=0):
"""
input_hic: str, input hic file path
output_pkl: str, output pickle file path
resolution: int, resolution of the hic file
"""
hic = hicstraw.HiCFile(input_hic)
chrom_list=[]
chrom_dict={}
for chrom in hic.getChromosomes():
print(chrom.name, chrom.length)
if "all" in chrom.name.lower():
continue
chrom_list.append(chrom)
chrom_dict[chrom.name]=chrom.length
resolution_list = hic.getResolutions()
if resolution not in resolution_list:
print("Resolution not found in the hic file, please choose from the following list:")
print(resolution_list)
exit()
output_dict={}
for i in range(len(chrom_list)):
for j in range(i,len(chrom_list)):
if i!=j and tondarray in [2,3]:
#skip inter-chromosome region
continue
chrom1 = chrom_list[i]
chrom1_name = chrom_list[i].name
chrom2 = chrom_list[j]
chrom2_name = chrom_list[j].name
if 'Un' in chrom1_name or 'Un' in chrom2_name:
continue
if "random" in chrom1_name.lower() or "random" in chrom2_name.lower():
continue
if "alt" in chrom1_name.lower() or "alt" in chrom2_name.lower():
continue
read_array=read_chrom_array(chrom1,chrom2, normalization, input_hic, resolution)
if read_array is None:
print("No data found for",chrom1_name,chrom2_name)
continue
if tondarray in [1,3]:
read_array = read_array.toarray()
if tondarray in [2,3]:
output_dict[chrom1_name]=read_array
else:
output_dict[chrom1_name+"_"+chrom2_name]=read_array
if output_pkl is not None:
output_dir = os.path.dirname(os.path.realpath(output_pkl))
os.makedirs(output_dir, exist_ok=True)
write_pkl(output_dict,output_pkl)
return output_dict
"""
Usage
```
python3 hic2array_simple.py [input.hic] [output.pkl] [resolution] [normalization_type] [mode]
```
This is the full cool2array script, converting both intra, inter chromosome regions to array format. <br>
The output array is saved in a pickle file as dict: [chrom1_chrom2]:[array] format. <br>
[resolution] is used to specify the resolution that stored in the output array. <br>
[normalization_type] supports the following type: <br>
```
0: NONE normalization applied, save the raw data to array.
1: VC normalization;
2: VC_SQRT normalization;
3: KR normalization;
4: SCALE normalization.
```
Four modes are supported for different format saving:
```
0: scipy coo_array format output;
1: numpy array format output;
2: scipy csr_array format output (only include intra-chromsome region).
3: numpy array format output (only include intra-chromsome region).
```
"""
if __name__ == '__main__':
import os
import sys
if len(sys.argv) != 6:
print('Usage: python3 hic2array_simple.py [input.hic] [output.pkl] [resolution] [normalization_type] [mode]')
print("This is the full hic2array script. ")
print("normalization type: 0: None normalization; 1: VC normalization; 2: VC_SQRT normalization; 3: KR normalization; 4: SCALE normalization")
print("mode: 0 for sparse matrix, 1 for dense matrix, 2 for sparce matrix (only cis-contact); 3 for dense matrix (only cis-contact).")
sys.exit(1)
resolution = int(sys.argv[3])
normalization_type = int(sys.argv[4])
mode = int(sys.argv[5])
normalization_dict={0:"NONE",1:"VC",2:"VC_SQRT",3:"KR",4:"SCALE"}
if normalization_type not in normalization_dict:
print('normalization type should be 0,1,2,3,4')
print("normalization type: 0: None normalization; 1: VC normalization; 2: VC_SQRT normalization; 3: KR normalization; 4: SCALE normalization")
sys.exit(1)
normalization_type = normalization_dict[normalization_type]
if mode not in [0,1,2,3]:
print('mode should be in choice of 0/1/2/3')
print("mode: 0 for sparse matrix, 1 for dense matrix, 2 for sparce matrix (only cis-contact); 3 for dense matrix (only cis-contact).")
sys.exit(1)
input_hic_path = os.path.abspath(sys.argv[1])
output_pkl_path = os.path.abspath(sys.argv[2])
output_dir = os.path.dirname(output_pkl_path)
os.makedirs(output_dir,exist_ok=True)
hic2array(input_hic_path,output_pkl_path,resolution,normalization_type,mode)