-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathicesParsing.R
185 lines (156 loc) · 6.82 KB
/
icesParsing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#' @noRd
initIC <- function(){
output <- list()
output$HI <- data.table::data.table(Country=character(), Year=character(), SeasonType=character(), Season=integer(), Fleet=character(), AreaType=character(), FishingArea=character(), DepthRange=character(),
UnitEffort=character(), Effort=integer(), AreaQualifier=character())
output$SI <- data.table::data.table(Country=character(), Year=character(), SeasonType=character(), Season=integer(), Fleet=character(), AreaType=character(), FishingArea=character(), DepthRange=character(),
Species=character(), Stock=character(), CatchCategory=character(), ReportingCategory=character(), DataToFrom=character(), Usage=character(), SamplesOrigin=character(), QualityFlag=character(), UnitCATON=character(), CATON=numeric(), OffLandings=integer(), varCATON=numeric(), InfoFleet=character(), InfoStockCoordinator=character(), InfoGeneral=character())
output$SD <- data.table::data.table(Country=character(), Year=character(), SeasonType=character(), Season=integer(), Fleet=character(), AreaType=character(), FishingArea=character(), DepthRange=character(),
Species=character(), Stock=character(), CatchCategory=character(), ReportingCategory=character(),
Sex=character(), CANUMtype=character(), AgeLength=integer(), PlusGroup=integer(), SampledCatch=integer(), NumSamplesLngt=integer(), NumLngtMeas=integer(), NumSamplesAge=integer(), NumAgeMeas=integer(), unitMeanWeight=character(), unitCANUM=character(), UnitAgeOrLength=character(), UnitMeanLength=character(), Maturity=character(), NumberCaught=numeric(), MeanWeight=numeric(), MeanLength=numeric(), varNumLanded=numeric(), varWgtLanded=numeric(), varLgtLanded=numeric())
return(output)
}
#' Missing value for Int may be encoded as -9
#' @noRd
parseInt <- function(chr){
if (is.na(chr) | chr=="-9"){
return(as.integer(NA))
}
else{
return(as.integer(chr))
}
}
#' Missing value for Int may be encoded as -9
#' The intercatch manual is a bit ambigious on how missing values for decimal data types should be encoded
#' It contains examples that specify them as -9, and practical tests indicate that -9 is required for some of them.
#' @noRd
parseNumeric <- function(chr){
if (is.na(chr) | chr=="-9"){
return(as.numeric(NA))
}
else{
return(as.numeric(chr))
}
}
#' Adds any missing non-mandatory charachter fields to intercatch tables
#' @param row data.table with parsed headerless rows
#' @param lastNeededPosition last position of the table that is either mandatory or non-character
#' @param headers for the parsed table
#' @noRd
appendFields <- function(row, lastNeededPosition, headers){
if (ncol(row) != length(headers)){
if (ncol(row)>length(headers)){
stop("Malformed InterCatch 1.0 file.")
}
if (ncol(row) < lastNeededPosition){
stop("Malformed InterCatch 1.0 file.")
}
row <- cbind(row, rep(NA, length(headers)-ncol(row)))
}
return(row)
}
#' @noRd
processHI <- function(vec, output){
stopifnot(vec[1]=="HI")
row <- data.table::data.table(t(vec[2:length(vec)]))
row <- appendFields(row, 11, names(output$HI))
names(row) <- names(output$HI)
#characters may be encoded as NA for missing values
row[row=="NA"] <- as.character(NA)
#integers may be encoded as -9 for missing values
row$Season <- parseInt(row$Season)
row$Effort <- parseInt(row$Effort)
output$HI <- rbind(output$HI, row)
return(output)
}
#' @noRd
processSI <- function(vec, output){
stopifnot(vec[1]=="SI")
row <- data.table::data.table(t(vec[2:length(vec)]))
row <- appendFields(row, 21, names(output$SI))
names(row) <- names(output$SI)
#characters may be encoded as NA for missing values
row[row=="NA"] <- NA
#integers may be encoded as -9 for missing values
row$Season <- parseInt(row$Season)
row$OffLandings <- parseInt(row$OffLandings)
#numeric variabes
row$CATON <- parseNumeric(row$CATON)
row$varCATON <- parseNumeric(row$varCATON)
output$SI <- rbind(output$SI, row)
return(output)
}
#' @noRd
processSD <- function(vec, output){
stopifnot(vec[1]=="SD")
row <- data.table::data.table(t(vec[2:length(vec)]))
row <- appendFields(row, 33, names(output$SD))
names(row) <- names(output$SD)
#characters may be encoded as NA for missing values
row[row=="NA"] <- NA
#integers may be encoded as -9 for missing values
row$Season <- parseInt(row$Season)
row$AgeLength <- parseInt(row$AgeLength)
row$PlusGroup <- parseInt(row$PlusGroup)
row$SampledCatch <- parseInt(row$SampledCatch)
row$NumSamplesLngt <- parseInt(row$NumSamplesLngt)
row$NumLngtMeas <- parseInt(row$NumLngtMeas)
row$NumSamplesAge <- parseInt(row$NumSamplesAge)
row$NumAgeMeas <- parseInt(row$NumAgeMeas)
#numeric variables
row$NumberCaught <- parseNumeric(row$NumberCaught)
row$MeanWeight <- parseNumeric(row$MeanWeight)
row$MeanLength <- parseNumeric(row$MeanLength)
row$varNumLanded <- parseNumeric(row$varNumLanded)
row$varWgtLanded <- parseNumeric(row$varWgtLanded)
row$varLgtLanded <- parseNumeric(row$varLgtLanded)
output$SD <- rbind(output$SD, row)
return(output)
}
#' Parses InterCatch
#' @description
#' Parses the InterCatch exchange format v 1.0 for Commercial Catch and Sample Data.
#' Parses HI,SI and SD recrods
#' @details
#' The InterCatch exchange format is a jagged comma-separated format,
#' where the number of fields on a line is determined by a record-type identifier in position 1.
#' Three record types are defined, "HI" (header information), "SI" (species information), and "SD" (species data).
#' The format it specified on https://ices.dk/data/Documents/Intercatch/IC-ExchangeFormat1-0.pdf.
#'
#' @param file path to file containing intercatch formatted data
#' @param encoding encoding of 'file'
#' @return named list with three members:
#' \describe{
#' \item{HI}{\code{\link[data.table]{data.table}} with HI records}
#' \item{SI}{\code{\link[data.table]{data.table}} with SI records}
#' \item{SD}{\code{\link[data.table]{data.table}} with SD records}
#' }
#' @importFrom data.table as.data.table
#' @export
parseInterCatch <- function(file, encoding="UTF-8"){
output <- initIC()
stream <- file(file, encoding = encoding)
lines <- readLines(stream)
close(stream)
for (l in lines){
#add and remove trailing field to ensure consistent splitting
vec <- strsplit(paste(l,"T",sep=","), ",")[[1]]
vec <- vec[1:length(vec)-1]
if (vec[1] == "HI"){
output <- processHI(vec, output)
}
else if (vec[1] == "SI"){
output <- processSI(vec, output)
}
else if (vec[1] == "SD"){
output <- processSD(vec, output)
}
else if (vec == ""){
#pass empty lines
}
else{
stop(paste("Record type", vec[1], "not recognized."))
}
}
return(output)
}