Skip to content

Commit 598496b

Browse files
committed
fix: CSV importer - skip null fields + auto set vertices import
Fixed issue #3818
1 parent bd5053b commit 598496b

5 files changed

Lines changed: 133 additions & 40 deletions

File tree

integration/src/main/java/com/arcadedb/integration/importer/Importer.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,14 @@ public Map<String, Object> load() {
5050

5151
startImporting();
5252

53-
loadFromSource(settings.url, AnalyzedEntity.EntityType.DATABASE, analyzedSchema);
53+
// Determine entity type for the main URL: if vertexType or edgeType is explicitly set, route accordingly
54+
AnalyzedEntity.EntityType urlEntityType = AnalyzedEntity.EntityType.DATABASE;
55+
if (settings.options.containsKey("vertexType") && settings.vertices == null)
56+
urlEntityType = AnalyzedEntity.EntityType.VERTEX;
57+
else if (settings.options.containsKey("edgeType") && settings.edges == null)
58+
urlEntityType = AnalyzedEntity.EntityType.EDGE;
59+
60+
loadFromSource(settings.url, urlEntityType, analyzedSchema);
5461
loadFromSource(settings.documents, AnalyzedEntity.EntityType.DOCUMENT, analyzedSchema);
5562
loadFromSource(settings.vertices, AnalyzedEntity.EntityType.VERTEX, analyzedSchema);
5663
loadFromSource(settings.edges, AnalyzedEntity.EntityType.EDGE, analyzedSchema);

integration/src/main/java/com/arcadedb/integration/importer/SourceDiscovery.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -197,12 +197,16 @@ private FormatImporter analyzeSourceContent(final Parser parser, final AnalyzedE
197197
break;
198198

199199
case VERTEX:
200-
knownFileType = settings.verticesFileType != null ? settings.verticesFileType : getFileTypeByExtension(settings.vertices);
200+
knownFileType = settings.verticesFileType != null ?
201+
settings.verticesFileType :
202+
getFileTypeByExtension(settings.vertices != null ? settings.vertices : settings.url);
201203
knownDelimiter = settings.verticesDelimiter;
202204
break;
203205

204206
case EDGE:
205-
knownFileType = settings.edgesFileType != null ? settings.edgesFileType : getFileTypeByExtension(settings.edgeTypeName);
207+
knownFileType = settings.edgesFileType != null ?
208+
settings.edgesFileType :
209+
getFileTypeByExtension(settings.edges != null ? settings.edges : settings.url);
206210
knownDelimiter = settings.edgesDelimiter;
207211
break;
208212

integration/src/main/java/com/arcadedb/integration/importer/format/CSVImporterFormat.java

Lines changed: 31 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,11 @@ private void loadDocuments(final SourceSchema sourceSchema, final Parser parser,
127127

128128
final MutableDocument document = database.newDocument(settings.documentTypeName);
129129

130-
for (final AnalyzedProperty prop : properties)
131-
document.set(prop.getName(), row[prop.getIndex()]);
130+
for (final AnalyzedProperty prop : properties) {
131+
final String value = row[prop.getIndex()];
132+
if (value != null && !value.isEmpty())
133+
document.set(prop.getName(), value);
134+
}
132135

133136
document.save();
134137
context.createdDocuments.incrementAndGet();
@@ -154,44 +157,34 @@ private void loadDocuments(final SourceSchema sourceSchema, final Parser parser,
154157
private void loadVertices(final SourceSchema sourceSchema, final Parser parser, final Database database,
155158
final ImporterContext context, final ImporterSettings settings) throws ImportException {
156159

157-
if (settings.typeIdProperty == null) {
158-
LogManager.instance()
159-
.log(this, Level.INFO, "Property id was not defined. Set `-typeIdProperty <name>`. Importing is aborted", null,
160-
settings.vertexTypeName, settings.typeIdProperty);
161-
throw new IllegalArgumentException("Property id was not defined. Set `-typeIdProperty <name>`. Importing is aborted");
162-
}
163-
164160
final AnalyzedEntity entity = sourceSchema.getSchema().getEntity(settings.vertexTypeName);
165161
if (entity == null) {
166162
LogManager.instance().log(this, Level.INFO, "Vertex type '%s' not defined", null, settings.vertexTypeName);
167163
return;
168164
}
169165

170-
final AnalyzedProperty id = entity.getProperty(settings.typeIdProperty);
166+
int idIndex = -1;
167+
if (settings.typeIdProperty != null) {
168+
final AnalyzedProperty id = entity.getProperty(settings.typeIdProperty);
171169

172-
if (id == null) {
173-
LogManager.instance()
174-
.log(this, Level.INFO, "Property Id '%s.%s' is null. Importing is aborted", null, settings.vertexTypeName,
175-
settings.typeIdProperty);
176-
throw new IllegalArgumentException(
177-
"Property Id '" + settings.vertexTypeName + "." + settings.typeIdProperty + "' is null. Importing is aborted");
178-
}
170+
if (id == null) {
171+
LogManager.instance()
172+
.log(this, Level.INFO, "Property Id '%s.%s' is null. Importing is aborted", null, settings.vertexTypeName,
173+
settings.typeIdProperty);
174+
throw new IllegalArgumentException(
175+
"Property Id '" + settings.vertexTypeName + "." + settings.typeIdProperty + "' is null. Importing is aborted");
176+
}
179177

180-
long expectedVertices = settings.expectedVertices;
181-
if (expectedVertices <= 0)
182-
expectedVertices = (int) (sourceSchema.getSource().totalSize / entity.getAverageRowLength());
183-
if (expectedVertices <= 0)
184-
expectedVertices = 1000000;
185-
else if (expectedVertices > Integer.MAX_VALUE)
186-
expectedVertices = Integer.MAX_VALUE;
187-
188-
// Ensure the typeIdProperty has a unique index for edge resolution
189-
if (!database.getSchema().getType(settings.vertexTypeName).existsProperty(settings.typeIdProperty))
190-
database.transaction(
191-
() -> database.getSchema().getType(settings.vertexTypeName).createProperty(settings.typeIdProperty, com.arcadedb.schema.Type.STRING));
192-
if (database.getSchema().getType(settings.vertexTypeName).getIndexesByProperties(settings.typeIdProperty).isEmpty())
193-
database.transaction(
194-
() -> database.getSchema().getType(settings.vertexTypeName).createTypeIndex(Schema.INDEX_TYPE.LSM_TREE, true, settings.typeIdProperty));
178+
idIndex = id.getIndex();
179+
180+
// Ensure the typeIdProperty has a unique index for edge resolution
181+
if (!database.getSchema().getType(settings.vertexTypeName).existsProperty(settings.typeIdProperty))
182+
database.transaction(
183+
() -> database.getSchema().getType(settings.vertexTypeName).createProperty(settings.typeIdProperty, com.arcadedb.schema.Type.STRING));
184+
if (database.getSchema().getType(settings.vertexTypeName).getIndexesByProperties(settings.typeIdProperty).isEmpty())
185+
database.transaction(
186+
() -> database.getSchema().getType(settings.vertexTypeName).createTypeIndex(Schema.INDEX_TYPE.LSM_TREE, true, settings.typeIdProperty));
187+
}
195188

196189
final AbstractParser<?> csvParser = createCSVParser(settings);
197190

@@ -209,8 +202,6 @@ else if (expectedVertices > Integer.MAX_VALUE)
209202
DatabaseFactory.getDefaultCharset())) {
210203
csvParser.beginParsing(inputFileReader);
211204

212-
final int idIndex = id.getIndex();
213-
214205
final List<AnalyzedProperty> properties = new ArrayList<>();
215206
if (!settings.vertexPropertiesInclude.isEmpty() && !settings.vertexPropertiesInclude.equalsIgnoreCase("*")) {
216207
final String[] includes = settings.vertexPropertiesInclude.split(",");
@@ -231,18 +222,21 @@ else if (expectedVertices > Integer.MAX_VALUE)
231222
if (skipEntries > 0 && line < skipEntries)
232223
continue;
233224

234-
if (idIndex >= row.length) {
225+
if (idIndex >= 0 && idIndex >= row.length) {
235226
LogManager.instance()
236227
.log(this, Level.INFO, "Property Id is configured on property %d but cannot be found on current record. Skip it",
237228
null, idIndex);
238229
continue;
239230
}
240231

241232
final MutableVertex v = database.newVertex(settings.vertexTypeName);
242-
v.set(settings.typeIdProperty, row[idIndex]);
233+
if (idIndex >= 0)
234+
v.set(settings.typeIdProperty, row[idIndex]);
243235
for (int p = 0; p < properties.size(); ++p) {
244236
final AnalyzedProperty prop = properties.get(p);
245-
v.set(prop.getName(), row[prop.getIndex()]);
237+
final String value = row[prop.getIndex()];
238+
if (value != null && !value.isEmpty())
239+
v.set(prop.getName(), value);
246240
}
247241
database.async().createRecord(v, doc -> context.createdVertices.incrementAndGet());
248242
}

integration/src/test/java/com/arcadedb/integration/importer/CSVImporterIT.java

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,14 @@
2020

2121
import com.arcadedb.database.Database;
2222
import com.arcadedb.database.DatabaseFactory;
23+
import com.arcadedb.database.Document;
2324
import com.arcadedb.graph.Vertex;
2425
import com.arcadedb.integration.TestHelper;
2526
import org.junit.jupiter.api.Test;
2627

2728
import java.io.File;
2829
import java.nio.file.Files;
30+
import java.util.Set;
2931

3032
import static org.assertj.core.api.Assertions.assertThat;
3133
import static org.assertj.core.api.Assertions.assertThatThrownBy;
@@ -345,6 +347,88 @@ void regressionIssue2267SeparateVertexTypeImports() throws Exception {
345347
TestHelper.checkActiveDatabases();
346348
}
347349

350+
/**
351+
* Verifies that importing a sparse CSV skips null/empty values instead of storing them as properties.
352+
* The fixture is small, but it stands in for very wide files (the original note mentions 4K columns) where most cells are empty: records should only contain properties with actual values.
353+
*/
354+
@Test
355+
void importSparseCSVSkipsNullValues() {
356+
final String databasePath = "target/databases/test-import-sparse";
357+
358+
final DatabaseFactory databaseFactory = new DatabaseFactory(databasePath);
359+
if (databaseFactory.exists())
360+
databaseFactory.open().drop();
361+
362+
final Database db = databaseFactory.create();
363+
try {
364+
db.command("sql", """
365+
IMPORT DATABASE file://src/test/resources/importer-sparse.csv
366+
WITH maxProperties=1000, maxPropertySize=8192
367+
""");
368+
369+
assertThat(db.countType("Document", true)).isEqualTo(3);
370+
371+
// Row with Id=1 in the fixture: Name=Tesla Model 3, Engine=Electric, Seats=5, GPS=true (Seats is not asserted below)
372+
// Its Color, Doors and Sunroof cells are empty, so those properties must NOT be stored
373+
db.iterateType("Document", true).forEachRemaining(record -> {
374+
final Document doc = record.asDocument();
375+
if ("1".equals(doc.getString("Id"))) {
376+
assertThat(doc.has("Name")).isTrue();
377+
assertThat(doc.has("Engine")).isTrue();
378+
assertThat(doc.has("GPS")).isTrue();
379+
// These must be absent (not merely null) - their cells were empty in the CSV
380+
assertThat(doc.has("Color")).as("Empty Color should not be stored").isFalse();
381+
assertThat(doc.has("Doors")).as("Empty Doors should not be stored").isFalse();
382+
assertThat(doc.has("Sunroof")).as("Empty Sunroof should not be stored").isFalse();
383+
}
384+
});
385+
} finally {
386+
db.drop();
387+
}
388+
TestHelper.checkActiveDatabases();
389+
}
390+
391+
/**
392+
* Verifies that IMPORT DATABASE with only vertexType set (no separate vertices source and no typeIdProperty) imports the main URL as vertices of that type, not as documents, and that empty cells are skipped on that path too.
393+
*/
394+
@Test
395+
void importCSVAsVerticesViaVertexType() {
396+
final String databasePath = "target/databases/test-import-vertex-type";
397+
398+
final DatabaseFactory databaseFactory = new DatabaseFactory(databasePath);
399+
if (databaseFactory.exists())
400+
databaseFactory.open().drop();
401+
402+
final Database db = databaseFactory.create();
403+
try {
404+
db.command("sql", """
405+
IMPORT DATABASE file://src/test/resources/importer-sparse.csv
406+
WITH vertexType=Car, maxProperties=1000, maxPropertySize=8192
407+
""");
408+
409+
// The Car type must exist and hold one record per CSV data row (3)
410+
assertThat(db.getSchema().existsType("Car")).isTrue();
411+
assertThat(db.countType("Car", true)).isEqualTo(3);
412+
413+
// Verify the imported records are actual vertices, not plain documents
414+
db.iterateType("Car", true).forEachRemaining(record -> {
415+
assertThat(record.asDocument()).isInstanceOf(Vertex.class);
416+
});
417+
418+
// Also verify empty cells are skipped when importing as vertices
419+
db.iterateType("Car", true).forEachRemaining(record -> {
420+
final Vertex v = record.asVertex();
421+
if ("1".equals(v.getString("Id"))) {
422+
assertThat(v.has("Name")).isTrue();
423+
assertThat(v.has("Color")).as("Empty Color should not be stored").isFalse();
424+
}
425+
});
426+
} finally {
427+
db.drop();
428+
}
429+
TestHelper.checkActiveDatabases();
430+
}
431+
348432
/**
349433
* Regression test for GitHub issue #3713: IMPORT DATABASE rejects edgeBidirectional = false
350434
* When edgeBidirectional=false is specified, the edge type should be created as non-bidirectional
integration/src/test/resources/importer-sparse.csv

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Id,Name,Color,Engine,Seats,Doors,Sunroof,GPS
2+
1,Tesla Model 3,,Electric,5,,,true
3+
2,Ford Mustang,Red,V8,,2,,
4+
3,Toyota Prius,,Hybrid,5,4,,

0 commit comments

Comments
 (0)