Add support for loading simple cell type annotations from GEO supplem… #1311

Draft · wants to merge 1 commit into base: feature-single-cell
@@ -488,7 +488,7 @@ private GeoSeries readSeriesFromGeo( String accession ) throws IOException {
if ( downloadedBytes > 0 ) {
log.info( String.format( "%s: Done downloading SOFT file (%s in %s @ %.3f MB/s).", accession,
FileUtils.byteCountToDisplaySize( downloadedBytes ), timer,
( 1000.0 / ( 1000.0 * 1000.0 ) ) * ( downloadedBytes / timer.getTime() ) ) );
( 1000.0 / ( 1000.0 * 1000.0 ) ) * ( ( double ) downloadedBytes / ( double ) timer.getTime() ) ) );
}
} catch ( IOException e ) {
if ( Files.exists( dest ) ) {
@@ -0,0 +1,157 @@
package ubic.gemma.core.loader.expression.geo.singleCell;

import lombok.extern.apachecommons.CommonsLog;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.lang3.StringUtils;
import ubic.gemma.core.loader.expression.geo.model.GeoSeries;
import ubic.gemma.core.loader.expression.singleCell.AbstractDelegatingSingleCellDataLoader;
import ubic.gemma.core.loader.expression.singleCell.SingleCellDataLoader;
import ubic.gemma.core.loader.util.ftp.FTPClientFactory;
import ubic.gemma.model.common.description.Categories;
import ubic.gemma.model.common.description.Characteristic;
import ubic.gemma.model.expression.bioAssayData.CellTypeAssignment;
import ubic.gemma.model.expression.bioAssayData.SingleCellDimension;

import javax.annotation.Nullable;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.*;

/**
* Loads author-provided cell type annotations from GEO supplementary files.
* <p>
* This loader makes best efforts to cover as many cases as possible.
* <p>
* Note that, ideally, cell type assignments are properly structured and can be loaded with
* {@link ubic.gemma.core.loader.expression.singleCell.GenericMetadataSingleCellDataLoader}.
* @author poirigui
*/
@CommonsLog
public class GeoCellTypeAssignmentLoader extends AbstractDelegatingSingleCellDataLoader {

/**
* A list of CSV formats to attempt to use.
*/
private static final CSVFormat[] csvFormats = {
// treat the first record as a header so that column names can be looked up later
CSVFormat.TDF.withFirstRecordAsHeader(), CSVFormat.EXCEL.withFirstRecordAsHeader() };

private final GeoSeries series;

private FTPClientFactory ftpClientFactory;

public GeoCellTypeAssignmentLoader( GeoSeries series, SingleCellDataLoader delegate ) {
super( delegate );
this.series = series;
}

public void setFtpClientFactory( FTPClientFactory ftpClientFactory ) {
this.ftpClientFactory = ftpClientFactory;
}

@Override
public Set<CellTypeAssignment> getCellTypeAssignments( SingleCellDimension dimension ) throws IOException {
Set<CellTypeAssignment> assignments = new HashSet<>();
for ( String file : series.getSupplementaryFiles() ) {
// detect possible candidates
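// try each supported CSV format in turn and keep the first one that parses successfully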
for ( int i = 0; i < csvFormats.length; i++ ) {
CSVFormat format = csvFormats[i];
try {
assignments.addAll( parseCellType( file, format, dimension ) );
break;
} catch ( Exception e ) {
if ( i < csvFormats.length - 1 ) {
log.warn( "Failed to parse cell type assignment from " + file + " with " + format + ", trying another format.", e );
} else {
log.warn( "Failed to parse cell type assignment from " + file + ", it will be ignored.", e );
}
}
}
}
// also include the assignments provided by the delegate
assignments.addAll( super.getCellTypeAssignments( dimension ) );
return assignments;
}

private List<CellTypeAssignment> parseCellType( String file, CSVFormat csvFormat, SingleCellDimension dimension ) throws IOException {
try ( CSVParser reader = csvFormat.parse( new InputStreamReader( openFile( file ) ) ) ) {
int sampleNameColumn = detectSampleNameColumn( reader );
int cellIdColumn = detectCellIdColumn( reader );
int[] cellTypeColumns = detectCellTypeColumns( reader );

// those have matching indices
List<CellTypeAssignment> assignments = new ArrayList<>( cellTypeColumns.length );
List<Map<String, Characteristic>> cellTypes = new ArrayList<>( cellTypeColumns.length );
for ( int c : cellTypeColumns ) {
CellTypeAssignment cta = CellTypeAssignment.Factory.newInstance( reader.getHeaderNames().get( c ) );
cta.setCellTypeIndices( new int[dimension.getNumberOfCells()] );
Arrays.fill( cta.getCellTypeIndices(), CellTypeAssignment.UNKNOWN_CELL_TYPE );
assignments.add( cta );
cellTypes.add( new HashMap<>() );
}

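// each record assigns one cell (identified by sample name and cell ID) to a cell type for every
// annotation column; cell types are encoded as indices into the assignment's characteristics,
// with UNKNOWN_CELL_TYPE used for missing values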
for ( CSVRecord record : reader ) {
String sampleName = StringUtils.stripToNull( record.get( sampleNameColumn ) );
String cellId = StringUtils.stripToNull( record.get( cellIdColumn ) );
int cellIndex = getCellIndex( sampleName, cellId, dimension );
for ( int i = 0; i < cellTypeColumns.length; i++ ) {
CellTypeAssignment assignment = assignments.get( i );
String cellType = StringUtils.stripToNull( record.get( cellTypeColumns[i] ) );
int ix;
if ( cellTypes.get( i ).containsKey( cellType ) ) {
ix = assignment.getCellTypes().indexOf( cellTypes.get( i ).get( cellType ) );
} else if ( isNa( cellType ) ) {
ix = CellTypeAssignment.UNKNOWN_CELL_TYPE;
log.debug( "Cell type is missing for " + sampleName + ":" + cellId + ", will be encoded as " + CellTypeAssignment.UNKNOWN_CELL_TYPE + "." );
} else {
Characteristic c = Characteristic.Factory.newInstance( Categories.CELL_TYPE, cellType, null );
assignment.getCellTypes().add( c );
assignment.setNumberOfCellTypes( assignment.getCellTypes().size() );
// remember the characteristic so that later occurrences of this cell type reuse the same index
cellTypes.get( i ).put( cellType, c );
ix = assignment.getCellTypes().size() - 1;
log.info( "New cell type detected: " + c + ", it will be encoded with " + ix + "." );
}

assignment.getCellTypeIndices()[cellIndex] = ix;
}
}
return assignments;
}
}

private InputStream openFile( String file ) throws IOException {
// TODO: gzip decompression, HTTP download, etc.
return ftpClientFactory.openStream( new URL( file ) );
}

/**
* Check if a cell type indicator is for missing data.
*/
private boolean isNa( @Nullable String cellType ) {
// TODO: implement other possible indicators
return cellType == null;
}

private int getCellIndex( String sampleName, String cellId, SingleCellDimension dimension ) {
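// placeholder: a complete implementation would locate the cell in the dimension from its sample name and cell ID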
return 0;

}

/**
* Detect the column that contains the sample name.
*/
private int detectSampleNameColumn( CSVParser parser ) {
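// placeholder: assumes the sample name is in the first column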
return 0;
}

/**
* Detect the column that contains the cell ID.
*/
private int detectCellIdColumn( CSVParser parser ) {
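// placeholder: assumes the cell ID is in the second column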
return 1;
}

/**
* Detect all the columns that could contain cell type information.
*/
private int[] detectCellTypeColumns( CSVParser parser ) {
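// placeholder: assumes cell type annotations occupy the third through fifth columns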
return new int[] { 2, 3, 4 };
}
}
@@ -1,6 +1,12 @@
package ubic.gemma.core.loader.expression.singleCell;

import lombok.extern.apachecommons.CommonsLog;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.file.PathUtils;
import org.apache.commons.lang3.time.StopWatch;
import org.apache.commons.net.ftp.FTPClient;
import org.apache.commons.net.ftp.FTPFile;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
@@ -9,7 +15,12 @@
import ubic.gemma.core.loader.expression.DesignElementMapper;
import ubic.gemma.core.loader.expression.EnsemblIdDesignElementMapper;
import ubic.gemma.core.loader.expression.MapBasedDesignElementMapper;
import ubic.gemma.core.loader.expression.geo.GeoFamilyParser;
import ubic.gemma.core.loader.expression.geo.model.GeoSeries;
import ubic.gemma.core.loader.expression.geo.singleCell.GeoBioAssayToSampleNameMatcher;
import ubic.gemma.core.loader.expression.geo.singleCell.GeoCellTypeAssignmentLoader;
import ubic.gemma.core.loader.util.ftp.FTPClientFactory;
import ubic.gemma.core.util.ProgressInputStream;
import ubic.gemma.model.common.description.Categories;
import ubic.gemma.model.common.description.Characteristic;
import ubic.gemma.model.common.description.ExternalDatabases;
@@ -30,12 +41,18 @@
import ubic.gemma.persistence.service.expression.experiment.ExpressionExperimentService;
import ubic.gemma.persistence.service.expression.experiment.SingleCellExpressionExperimentService;

import javax.annotation.Nullable;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;

import static java.util.Objects.requireNonNull;

@@ -58,9 +75,15 @@ public class SingleCellDataLoaderServiceImpl implements SingleCellDataLoaderServ
@Autowired
private QuantitationTypeService quantitationTypeService;

@Autowired
private FTPClientFactory ftpClientFactory;

@Value("${gemma.download.path}/singleCellData")
private Path singleCellDataBasePath;

@Value("${geo.local.datafile.basepath}")
private File geoSeriesDownloadPath;

@Override
@Transactional
public QuantitationType load( ExpressionExperiment ee, ArrayDesign platform, SingleCellDataLoaderConfig config ) {
@@ -497,6 +520,17 @@ private SingleCellDataLoader configureLoader( SingleCellDataLoader loader, Expre
}
// apply GEO strategy for matching
if ( ee.getAccession() != null && ee.getAccession().getExternalDatabase().getName().equals( ExternalDatabases.GEO ) ) {
try {
// include CTAs from the series
GeoSeries series = readSeriesFromGeo( ee.getAccession().getAccession() );
if ( series != null ) {
GeoCellTypeAssignmentLoader geoLoader = new GeoCellTypeAssignmentLoader( series, loader );
// supply the FTP client factory so that the loader can open supplementary files
geoLoader.setFtpClientFactory( ftpClientFactory );
loader = geoLoader;
} else {
log.warn( "No series file was found for " + ee + ", will not be able to load cell type assignments." );
}
} catch ( IOException e ) {
log.warn( "Failed to read series file for " + ee + ", will not be able to load cell type assignments.", e );
}
loader.setBioAssayToSampleNameMatcher( new GeoBioAssayToSampleNameMatcher() );
} else {
log.info( String.format( "%s does not have a GEO accession, using %s for matching sample names to BioAssays.",
@@ -591,4 +625,52 @@ private Path getLoomFile( ExpressionExperiment ee ) {
.resolve( ee.getAccession().getExternalDatabase().getName() )
.resolve( ee.getAccession().getAccession() + ".loom" );
}

@Nullable
private GeoSeries readSeriesFromGeo( String accession ) throws IOException {
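// GEO stores series on its FTP site under geo/series/, grouped into directories where the last
// three digits of the accession are replaced with "nnn" (e.g. GSE123456 -> GSE123nnn)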
String remoteFile = String.format( "geo/series/%snnn/%s/soft/%s_family.soft.gz",
accession.substring( 0, accession.length() - 3 ), accession, accession );
URL softFileUrl = new URL( "ftp://ftp.ncbi.nlm.nih.gov/" + remoteFile );
Path dest = geoSeriesDownloadPath.toPath().resolve( accession ).resolve( accession + ".soft.gz" );
boolean download = true;
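// reuse a previously downloaded SOFT file if its size matches the remote copy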
if ( Files.exists( dest ) ) {
FTPClient client = ftpClientFactory.getFtpClient( softFileUrl );
try {
FTPFile res = client.mlistFile( remoteFile );
long expectedLength = res != null ? res.getSize() : -1;
if ( expectedLength != -1 && dest.toFile().length() == expectedLength ) {
log.info( accession + ": Using existing SOFT file " + dest + "." );
download = false;
}
ftpClientFactory.recycleClient( softFileUrl, client );
} catch ( Exception e ) {
ftpClientFactory.destroyClient( softFileUrl, client );
throw e;
}
}
if ( download ) {
log.info( accession + ": Downloading SOFT file to " + dest + "..." );
PathUtils.createParentDirectories( dest );
StopWatch timer = StopWatch.createStarted();
try ( InputStream in = new ProgressInputStream( ftpClientFactory.openStream( softFileUrl ), accession + ".soft.gz", SingleCellDataLoaderService.class.getName() ); OutputStream out = Files.newOutputStream( dest ) ) {
int downloadedBytes = IOUtils.copy( in, out );
if ( downloadedBytes > 0 ) {
log.info( String.format( "%s: Done downloading SOFT file (%s in %s @ %.3f MB/s).", accession,
FileUtils.byteCountToDisplaySize( downloadedBytes ), timer,
( 1000.0 / ( 1000.0 * 1000.0 ) ) * ( ( double ) downloadedBytes / ( double ) timer.getTime() ) ) );
}
} catch ( IOException e ) {
if ( Files.exists( dest ) ) {
log.warn( accession + ": An I/O error occurred while downloading the SOFT file, removing " + dest.getParent() + "...", e );
PathUtils.deleteDirectory( dest.getParent() );
}
throw e;
}
}
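// the SOFT file is gzip-compressed; parse it and return the series matching the requested accession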
try ( InputStream is = new GZIPInputStream( Files.newInputStream( dest ) ) ) {
GeoFamilyParser parser = new GeoFamilyParser();
parser.parse( is );
return requireNonNull( parser.getUniqueResult() ).getSeriesMap().get( accession );
}
}
}
@@ -129,8 +129,14 @@ public String toString() {

public static class Factory {

public static CellTypeAssignment newInstance( String name ) {
CellTypeAssignment cta = new CellTypeAssignment();
cta.setName( name );
return cta;
}

public static CellTypeAssignment newInstance( String name, List<Characteristic> characteristics, int[] indices ) {
CellTypeAssignment cta = new CellTypeAssignment();
CellTypeAssignment cta = newInstance( name );
cta.setName( name );
cta.setCellTypes( characteristics );
cta.setCellTypeIndices( indices );
@@ -0,0 +1,16 @@
package ubic.gemma.core.loader.expression.geo.singleCell;

import org.junit.Test;
import ubic.gemma.core.loader.expression.geo.model.GeoSeries;
import ubic.gemma.model.expression.bioAssayData.SingleCellDimension;

import static org.mockito.Mockito.mock;

public class GeoCellTypeAssignmentLoaderTest {

@Test
public void testGSE() throws Exception {
// minimal smoke test using mocked collaborators; a real test would load an actual GEO series fixture
GeoSeries series = mock();
SingleCellDimension dimension = mock();
GeoCellTypeAssignmentLoader loader = new GeoCellTypeAssignmentLoader( series, mock() );
loader.getCellTypeAssignments( dimension );
}
}