forked from tsalo/convert-eprime
-
Notifications
You must be signed in to change notification settings - Fork 0
/
index_eprime_files.py
399 lines (328 loc) · 13.9 KB
/
index_eprime_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
# emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
"""
Designed to check for the existence of paired edat/text files in a folder.
It will flag text files that do not have a paired edat or which have a paired
text (two text files).
Somewhat functional, but barely readable, as of 150209.
"""
from __future__ import print_function
from builtins import range
import re
import os
from os.path import isfile
import sys
import time
import json
import shutil
from glob import glob
import pandas as pd
def _add_subject(df, subj, timepoint, organized, organized_when, organized_by,
                 converted, converted_when, converted_by, notes):
    """
    Adds information about subject's data to spreadsheet.

    Parameters
    ----------
    df : :obj:`pandas.DataFrame`
        A dataframe for logging organized and converted behavioral data. A new
        row will be added for each new file.
    subj : str
        Subject ID.
    timepoint : str
        Timepoint of new data.
    organized : bool
        Whether or not the file was successfully organized (copied from a
        general folder for all new E-Prime files into a folder structure
        organized by task, subject, and timepoint).
    organized_when : str
        The date when the file was organized.
    organized_by : str
        The initials of the person who organized the file. If this program
        organized the file, then this should be 'PY'.
    converted : bool
        Whether or not the file was successfully converted to csv format.
    converted_when : str
        The date when the file was converted.
    converted_by : str
        The initials of the person who converted the file. If this program
        converted the file, then this should be 'PY'.
    notes : str
        Relevant notes regarding the file (e.g., that multiple text files were
        found).

    Returns
    -------
    df : :obj:`pandas.DataFrame`
        A new dataframe holding the rows of the input dataframe followed by
        one row for the new file. The input dataframe is not modified.
    """
    row = pd.DataFrame([dict(Subject=subj, Timepoint=timepoint,
                             Organized=organized,
                             Date_Organized=organized_when,
                             Organized_by=organized_by, Converted=converted,
                             Date_Converted=converted_when,
                             Converted_by=converted_by,
                             Notes=notes)])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to add rows. ignore_index=True renumbers the result 0..n-1 so the
    # appended rows do not all share the index label 0.
    df = pd.concat([df, row], ignore_index=True)
    return df
def _get_subject(text_file):
    """
    Extracts the subject ID from an E-Prime file name.

    Parameters
    ----------
    text_file : str
        Path to a file generated by E-Prime. The file name should follow the
        pattern blahblahblah_[subj]-[tp].txt or blahblahblah-[subj]-[tp].txt,
        optionally ending in -Left_Handed (which is ignored).

    Returns
    -------
    subj : str
        The lower-cased subject ID found in the file name.
    """
    stem = os.path.splitext(text_file)[0]
    name = os.path.basename(stem).replace('-Left_Handed', '')

    # Positions of every hyphen in the cleaned-up file name.
    hyphen_positions = [pos for pos, char in enumerate(name) if char == '-']

    if len(hyphen_positions) == 1:
        # Only one hyphen: the subject ID starts after the last underscore
        # found when the final two characters of the name are left out.
        start = name[:len(name) - 2].rindex('_')
    else:
        # Otherwise the subject ID sits between the last two hyphens.
        start = hyphen_positions[-2]
    stop = hyphen_positions[-1]

    return name[start + 1:stop].lower()
def _get_timepoint(text_file):
    """
    Splits file name by hyphens to determine timepoint.

    Parameters
    ----------
    text_file : str
        A text file generated by E-Prime. Should be in the format
        blahblahblah_[subj]-[tp].txt or blahblahblah-[subj]-[tp].txt, with an
        optional -Left_Handed at the end.

    Returns
    -------
    timepoint : str
        The timepoint identifier associated with the input text file. This is
        the text after the last hyphen, cut off at the last underscore when
        that underscore comes after the hyphen.

    Raises
    ------
    ValueError
        If the file name contains no hyphen.
    """
    path_with_filename, _ = os.path.splitext(text_file)
    fname = os.path.basename(path_with_filename)
    fname = fname.replace('-Left_Handed', '')

    last_hyphen = fname.rfind('-')
    if last_hyphen == -1:
        raise ValueError('Cannot determine timepoint: no hyphen in file '
                         'name "{0}".'.format(fname))

    # rfind returns -1 when there is no underscore, which sorts before any
    # hyphen position and therefore falls into the "no suffix" branch.
    last_underscore = fname.rfind('_')
    if last_underscore > last_hyphen:
        # An underscore follows the last hyphen: the timepoint is the text
        # between the two.
        timepoint = fname[last_hyphen + 1:last_underscore]
    else:
        # Use all characters after the last hyphen, so that timepoints longer
        # than one character are not truncated to their final character.
        timepoint = fname[last_hyphen + 1:]
    return timepoint
def _organize_files(subject_id, timepoint, files, organized_dir):
    """
    Copies files into the organized directory and archives the originals.

    Each file is copied to [organized_dir]/[subject_id]/[timepoint]/ and the
    original is then moved into a 'done' subfolder next to where it was found.
    A file whose name is already present in the destination folder is neither
    copied nor moved; a sentence saying so is added to the returned note.

    Parameters
    ----------
    subject_id : str
        The subject ID associated with the files to be organized.
    timepoint : str
        The timepoint (session) to which the files being organized belong.
    files : list
        Paths of the files to place in the organized directory.
    organized_dir : str
        Output folder under which the subject and timepoint subfolders are
        created as needed.

    Returns
    -------
    note : str
        One sentence per file that was skipped because it already existed in
        the destination; an empty string if nothing was skipped.
    """
    messages = []
    for source_path in files:
        source_dir = os.path.dirname(source_path)
        base_name = os.path.basename(source_path)

        # Make sure the destination folder is there before checking it.
        target_dir = os.path.join(organized_dir, subject_id, timepoint)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)

        # Leave both copies alone when the destination already has the file.
        if os.path.isfile(os.path.join(target_dir, base_name)):
            messages.append(
                'File {0} already exists in {1}. '.format(base_name,
                                                          target_dir))
            continue

        shutil.copy(source_path, target_dir)

        # Archive the original beside its old location.
        done_dir = os.path.join(source_dir, 'done')
        if not os.path.exists(done_dir):
            os.mkdir(done_dir)
        shutil.move(source_path, done_dir)
    return ''.join(messages)
def main(directory, csv_file, param_file):
    """
    Indexes the E-Prime files in a directory, organizes them, and logs them.

    The edat and text files found in ``directory`` are grouped by file name
    into five categories: a text/edat pair, one edat with two text files, two
    similarly named text files, a lone text file, and a lone edat file. Each
    group is handed to ``_organize_files`` and a row describing the outcome is
    added to the log, which is written to ``csv_file`` at the end.

    Parameters
    ----------
    directory : str
        The directory containing raw E-Prime output files to be organized and
        converted.
    csv_file : str
        A csv file where organizations and conversions are logged. If the file
        doesn't exist, a new one will be created.
    param_file : str
        A json file with task-specific settings. Two keys are read here:
        ``org_dir`` (the folder that organized files are copied into) and
        ``timepoints`` (a mapping from the timepoint code found in a file name
        to the name used for that timepoint's folder and log entry).
    """
    note_dict = {
        'one_text': 'One text file- must be recovered.',
        'two_texts': 'Two text files- must be merged.',
        'three_files': 'One edat and two text files- it\'s a thinker.',
        'pair': 'All good.',
        'one_edat': 'One edat and no text file(s)- unknown problem.',
    }

    # Read in data
    with open(param_file, 'r') as fid:
        param_dict = json.load(fid)

    if isfile(csv_file):
        df = pd.read_csv(csv_file)
    else:
        df = pd.DataFrame(columns=['Subject', 'Timepoint', 'Organized',
                                   'Date_Organized', 'Organized_by',
                                   'Converted', 'Date_Converted',
                                   'Converted_by', 'Notes'])
    columns = df.columns.tolist()

    # Grab edat and edat2 files
    edat_files = glob(os.path.join(directory, '*.edat*'))
    # Text files need - for timepoint
    text_files = glob(os.path.join(directory, '*-*.txt'))
    all_files = sorted(edat_files + text_files)

    # A pair is a text file and an edat file with the same name (extension
    # aside).
    pairs = []
    for text_file in text_files:
        text_fname, _ = os.path.splitext(text_file)
        for edat_file in edat_files:
            edat_fname, _ = os.path.splitext(edat_file)
            if text_fname == edat_fname:
                pairs.append([text_file, edat_file])

    paired_texts = [pair[0] for pair in pairs]
    # Sorted so that the processing order is the same from run to run.
    unpaired_texts = sorted(set(text_files) - set(paired_texts))

    # Find text files that correspond to a pair for a triad (one edat, two
    # similarly named text files). The comparison drops the last six
    # characters of the unpaired file's path ('.txt' plus the two characters
    # before it) and looks for the remainder inside the paired text's path.
    three_files = []
    triad_text_idx = set()
    for i, up_text in enumerate(unpaired_texts):
        for j, p_text in enumerate(paired_texts):
            if up_text[:len(up_text)-6] in p_text:
                three_files.append([p_text, pairs[j][1], up_text])
                triad_text_idx.add(i)

    # Remove files with buddies from list of unpaired files. A set of indices
    # is used because one unpaired text can match several pairs; removing the
    # same index repeatedly would drop unrelated files or raise IndexError.
    unpaired_texts = [f for i, f in enumerate(unpaired_texts)
                      if i not in triad_text_idx]

    # Pairs that became part of a triad are no longer plain pairs.
    triad_pairs = [triad[0:2] for triad in three_files]
    pairs = [pair for pair in pairs if pair not in triad_pairs]

    # Find pairs of similarly named text files (same prefix test as above).
    two_texts = []
    two_text_idx = set()
    for i, up_text1 in enumerate(unpaired_texts):
        for j in range(i + 1, len(unpaired_texts)):
            up_text2 = unpaired_texts[j]
            if up_text1[:len(up_text1)-6] in up_text2:
                two_texts.append([up_text1, up_text2])
                # A text may match more than one other text, so indices are
                # collected in a set to remove each file exactly once.
                two_text_idx.update((i, j))

    # one_text is the remaining un-paired text files. Place these in a list of
    # lists to match the format of the other file lists.
    one_text = [[f] for i, f in enumerate(unpaired_texts)
                if i not in two_text_idx]

    # Place all accounted-for files in one list
    af_files = [item
                for file_set in (pairs + two_texts + three_files + one_text)
                for item in file_set]

    # one_edat lists the edat files without an associated text file. I'm not
    # sure what would cause this situation to occur, so they're catalogued with
    # their own message.
    one_edat = [[edat] for edat in sorted(set(all_files) - set(af_files))]

    labelled_sets = ([('one_text', file_set) for file_set in one_text] +
                     [('two_texts', file_set) for file_set in two_texts] +
                     [('three_files', file_set) for file_set in three_files] +
                     [('pair', file_set) for file_set in pairs] +
                     [('one_edat', file_set) for file_set in one_edat])

    # Determine subject IDs and timepoints for all files before anything is
    # copied or moved, so a badly named file stops the run up front.
    # Assumes that files will be named according to convention
    # blahblahblah_[subj]-[tp].txt or blahblahblah-[subj]-[tp].txt.
    # The first file of each set supplies the subject and timepoint.
    entries = [(_get_subject(file_set[0]), _get_timepoint(file_set[0]),
                notetype, file_set)
               for notetype, file_set in labelled_sets]

    # Where organized files will be outputted
    organized_dir = param_dict.get('org_dir')

    for subj, session, notetype, files in entries:
        # NOTE(review): this is None when the session code is missing from
        # the param file's 'timepoints' mapping; _organize_files joins it
        # into a path, which would then raise a TypeError that is not caught
        # below. Confirm whether such files should be skipped instead.
        timepoint_string = param_dict.get('timepoints').get(session)
        files_note = note_dict.get(notetype)
        print('Processing {0}- {1}'.format(subj, timepoint_string))

        try:
            note = _organize_files(subj, timepoint_string, files,
                                   organized_dir)
            note += files_note
            organized = True
            organized_when = time.strftime('%Y/%m/%d')
            organized_by = 'PY'
            print('\tSuccessfully organized')
        except IOError:
            note = files_note
            organized = False
            organized_when = ''
            organized_by = ''
            print('\tCouldn\'t be organized.')

        # No conversion routine is invoked here; a complete text/edat pair is
        # simply logged as converted and every other file set as not.
        try:
            if notetype == 'pair':
                converted = True
                converted_when = time.strftime('%Y/%m/%d')
                converted_by = 'PY'
                print('\tSuccessfully converted')
            else:
                converted = False
                converted_when = ''
                converted_by = ''
                print('\tCouldn\'t be converted.')
        except IOError:
            converted = False
            converted_when = ''
            converted_by = ''
            print('\tCouldn\'t be converted.')

        df = _add_subject(df, subj, timepoint_string,
                          organized, organized_when, organized_by,
                          converted, converted_when, converted_by, note)

    # Keep only the log's original columns, in their original order.
    df = df[columns]
    df.to_csv(csv_file, index=False)
if __name__ == '__main__':
    # If you call this script from the shell, the arguments are assumed
    # to be the raw data directory, the organization csv file, and the
    # task's param_file, in that order. Any further arguments are ignored.
    if len(sys.argv) < 4:
        # Exit with a usage message rather than an IndexError traceback.
        sys.exit('Usage: {0} <directory> <csv_file> <param_file>'.format(
            sys.argv[0]))
    main(sys.argv[1], sys.argv[2], sys.argv[3])