Sync your data
How to sync data from external data sources.
After defining the content types, you can start syncing the data. However, the data must first be formatted as JSON Lines for Optimizely Graph to ingest it.
📘
Note: This tutorial uses the non-commercial datasets of IMDb.
The following fields are required to be set, so you can find results when you query after syncing:
- `ContentType` – The values of Content Types.
- `Status` – Field set to one of the following:
  - `Draft` – Not exposed or available with the public key.
  - `Published` – Available with the public key.
- `_rbac` – Uses role-based access to hide items. To make the data available for everyone, set the value to `r:Everyone:Read`.
- `__typename` – Needed to support inline fragments, where the value must be the implemented content type.
📘
Note
`__typename` is prefixed with two underscores.
Each item you send to Optimizely Graph needs to be prefaced with a JSON line that sets the `_id` (unique ID belonging to the item) and the `language_routing`. For this tutorial, `language_routing` is set to `"en"` because that is what was configured in the Content Types' `languages`.
You can sync the data from the TSV files to Optimizely Graph (`POST /api/content/v2/data`) using the following Python code:
#!/usr/bin/env python
"""Sync the IMDb TSV sample files to Optimizely Graph as JSON Lines bulk requests."""
import collections
import csv
import json

OrderedDict = collections.OrderedDict

SOURCE = "imdb"
DATA_SYNC_ENDPOINT = "https://cg.optimizely.com/api/content/v2/data?id={}".format(SOURCE)
HEADERS = {
    'Content-Type': 'text/plain',
    'Authorization': 'Basic <Token>'
}

NAME_BASICS_FILE = 'data/name.basics.small.tsv'
TITLE_BASICS_FILE = 'data/title.basics.small.tsv'
TITLE_RATINGS_FILE = 'data/title.ratings.small.tsv'

STRING_ARRAY_FIELDS = ["ContentType", "knownForTitles", "primaryProfession___searchable", "genres___searchable"]
INT_FIELDS = ["birthYear", "deathYear", "startYear", "endYear", "runtimeMinutes", "numVotes"]
FLOAT_FIELDS = ["averageRating"]
BOOLEAN_FIELDS = ["isAdult"]

# Marker the TSV files use for a missing value.
NULL_VALUE = "\\N"
# Number of items sent per bulk request.
BATCH_SIZE = 100
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 60


def _convert_value(field, value):
    """Convert one raw TSV cell to the JSON value sent for *field*."""
    if field in STRING_ARRAY_FIELDS:
        # str.split already returns [value] when there is no comma.
        return value.split(",")
    if field in INT_FIELDS:
        return None if value == NULL_VALUE else int(value)
    if field in FLOAT_FIELDS:
        return None if value == NULL_VALUE else float(value)
    if field in BOOLEAN_FIELDS:
        return value == "1"
    return value


def _build_bulk(source, items):
    """Render ``(row_number, document)`` pairs as a JSON Lines bulk payload.

    Every document is preceded by an action line that sets its ``_id`` and
    ``language_routing``. Both lines go through ``json.dumps`` so that a
    *source* containing quotes or backslashes still yields valid JSON.
    """
    lines = []
    for row_number, document in items:
        action = {
            "index": {
                "_id": "{}{}".format(source, row_number),
                "language_routing": "en",
            }
        }
        lines.append(json.dumps(action))
        lines.append(json.dumps(document))
    return '\n'.join(lines)


def _post_bulk(bulk):
    """POST one bulk payload to DATA_SYNC_ENDPOINT and print the response body."""
    # Third-party dependency (pip install requests). Imported here so the
    # parsing helpers above remain importable without it.
    import requests

    response = requests.post(
        DATA_SYNC_ENDPOINT,
        headers=HEADERS,
        data=bulk.encode("utf-8"),
        timeout=REQUEST_TIMEOUT,
    )
    print(response.text)


def load_data(source, content_type, *, batch_size=BATCH_SIZE, send=None):
    """Read a TSV file and sync its rows in bulk requests.

    The first row of the file is used as the field names. Each following row
    is converted according to the ``*_FIELDS`` lists, extended with the
    ``ContentType``, ``Status``, ``_rbac`` and ``__typename`` fields, and sent
    in batches.

    Args:
        source: Path of the TSV file; also used as the prefix of each ``_id``.
        content_type: Content type name stored in ``ContentType`` and
            ``__typename``.
        batch_size: Maximum number of items per bulk request.
        send: Callable that receives each bulk payload as a ``str``. Defaults
            to posting the payload to ``DATA_SYNC_ENDPOINT``.

    A file without a header row, or without data rows, sends nothing.
    """
    if send is None:
        send = _post_bulk

    # newline='' is what the csv module expects; the encoding is explicit so
    # the result does not depend on the platform default.
    with open(source, 'r', encoding='utf-8', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='"')
        header = next(reader, None)
        if header is None:
            return
        header = header + ["ContentType", "Status", "_rbac", "__typename"]

        batch = []
        for row_number, (is_last_row, row) in enumerate(is_last(reader), start=1):
            values = [_convert_value(header[i], value) for i, value in enumerate(row)]
            values += [["Record", content_type], "Published", "r:Everyone:Read", content_type]
            # The _id is derived from the row's own 1-based position, so it
            # stays unique even when the final batch is only partially filled.
            batch.append((row_number, OrderedDict(zip(header, values))))

            if len(batch) >= batch_size or is_last_row:
                send(_build_bulk(source, batch))
                batch = []


def is_last(itr):
    """Yield ``(is_last, item)`` pairs for every item of *itr*.

    ``is_last`` is ``True`` only for the final item. An empty iterable yields
    nothing.
    """
    itr = iter(itr)
    try:
        old = next(itr)
    except StopIteration:
        # Without this guard the StopIteration would surface as a
        # RuntimeError from inside the generator (PEP 479).
        return
    for new in itr:
        yield False, old
        old = new
    yield True, old


if __name__ == "__main__":
    load_data(NAME_BASICS_FILE, "Actor")
    load_data(TITLE_BASICS_FILE, "Title")
    load_data(TITLE_RATINGS_FILE, "Rating")

# A bulk request will look like the following:
{"index": { "_id": "data/name.basics.small.tsv1000", "language_routing": "en" }}
{"nconst": "nm0000001", "primaryName___searchable": "Fred Astaire", "birthYear": 1899, "deathYear": 1987, "primaryProfession___searchable": ["soundtrack", "actor", "miscellaneous"], "knownForTitles": ["tt0050419", "tt0031983", "tt0053137", "tt0072308"], "ContentType": ["Record", "Actor"], "Status": "Published", "_rbac": "r:Everyone:Read", "__typename": "Actor"}
{"index": { "_id": "data/name.basics.small.tsv1001", "language_routing": "en" }}
{"nconst": "nm0000002", "primaryName___searchable": "Lauren Bacall", "birthYear": 1924, "deathYear": 2014, "primaryProfession___searchable": ["actress", "soundtrack"], "knownForTitles": ["tt0075213", "tt0037382", "tt0117057", "tt0038355"], "ContentType": ["Record", "Actor"], "Status": "Published", "_rbac": "r:Everyone:Read", "__typename": "Actor"}

Updated 2 months ago
