// Any copyright is dedicated to the Public Domain.
// http://creativecommons.org/publicdomain/zero/1.0/

// Not intended to be fast, but if you can make it faster, please help out!

/**
 * WebVTT file parser.
 *
 * Usage: `new WebVTTParser().parse(input, mode)`.
 *
 * `parse` splits the input into lines, validates the "WEBVTT" signature and
 * header, then collects cues. It never throws on malformed input; problems
 * are recorded in the returned `errors` array instead.
 *
 * @constructor
 */
var WebVTTParser = function() {
  /**
   * Parses a complete WebVTT document.
   *
   * @param {string} input Text of the WebVTT file.
   * @param {string} mode  Passed through to the cue text parser; "metadata"
   *                       silences cue text errors and "chapters" reports
   *                       tags and timestamps inside cue text as errors.
   * @returns {{cues: Array, errors: Array, time: number}} The cues sorted by
   *          start time (longer cue first on ties), the collected errors as
   *          `{message, line, col}` objects, and the elapsed parse time in
   *          milliseconds.
   */
  this.parse = function(input, mode) {
    //XXX need global search and replace for \0
    var NEWLINE = /\r\n|\r|\n/,
        startTime = Date.now(),
        linePos = 0,
        lines = input.split(NEWLINE),
        alreadyCollected = false,
        cues = [],
        errors = [];

    // Records an error against the line currently being processed (1-based).
    function err(message, col) {
      errors.push({message:message, line:linePos+1, col:col});
    }

    var line = lines[linePos],
        lineLength = line.length,
        signature = "WEBVTT",
        bom = 0,
        signature_length = signature.length;

    /* Byte order mark */
    if (line[0] === "\ufeff") {
      bom = 1;
      signature_length += 1;
    }

    /* SIGNATURE */
    if (
      lineLength < signature_length ||
      line.indexOf(signature) !== 0+bom ||
      lineLength > signature_length &&
      line[signature_length] !== " " &&
      line[signature_length] !== "\t"
    ) {
      err("No valid signature. (File needs to start with \"WEBVTT\".)");
    }

    linePos++;

    /* HEADER */
    while(lines[linePos] != "" && lines[linePos] != undefined) {
      err("No blank line after the signature.");
      if(lines[linePos].indexOf("-->") != -1) {
        // A timing line directly in the header: hand it to the cue loop.
        alreadyCollected = true;
        break;
      }
      linePos++;
    }

    /* CUE LOOP */
    while(lines[linePos] != undefined) {
      var cue;
      while(!alreadyCollected && lines[linePos] == "") {
        linePos++;
      }
      if(!alreadyCollected && lines[linePos] == undefined) {
        break;
      }

      /* CUE CREATION */
      cue = {
        id:"",
        startTime:0,
        endTime:0,
        pauseOnExit:false,
        direction:"horizontal",
        snapToLines:true,
        linePosition:"auto",
        textPosition:50,
        size:100,
        alignment:"middle",
        text:"",
        tree:null
      };

      var parseTimings = true;

      if(lines[linePos].indexOf("-->") == -1) {
        cue.id = lines[linePos];

        /* COMMENTS
           Not part of the specification's parser as these would just be ignored. However,
           we want them to be conforming and not get "Cue identifier cannot be standalone".
         */
        if(/^NOTE($|[ \t])/.test(cue.id)) { // .startsWith fails in Chrome
          linePos++;
          while(lines[linePos] != "" && lines[linePos] != undefined) {
            if(lines[linePos].indexOf("-->") != -1) {
              err("Cannot have timestamp in a comment.");
            }
            linePos++;
          }
          continue;
        }

        linePos++;

        if(lines[linePos] == "" || lines[linePos] == undefined) {
          err("Cue identifier cannot be standalone.");
          continue;
        }

        if(lines[linePos].indexOf("-->") == -1) {
          parseTimings = false;
          err("Cue identifier needs to be followed by timestamp.");
        }
      }

      /* TIMINGS */
      alreadyCollected = false;
      var timings = new WebVTTCueTimingsAndSettingsParser(lines[linePos], err);
      var previousCueStart = 0;
      if(cues.length > 0) {
        previousCueStart = cues[cues.length-1].startTime;
      }
      if(parseTimings && !timings.parse(cue, previousCueStart)) {
        /* BAD CUE */
        cue = null;
        linePos++;

        /* BAD CUE LOOP */
        while(lines[linePos] != "" && lines[linePos] != undefined) {
          if(lines[linePos].indexOf("-->") != -1) {
            alreadyCollected = true;
            break;
          }
          linePos++;
        }
        continue;
      }
      linePos++;

      /* CUE TEXT LOOP */
      while(lines[linePos] != "" && lines[linePos] != undefined) {
        if(lines[linePos].indexOf("-->") != -1) {
          err("Blank line missing before cue.");
          alreadyCollected = true;
          break;
        }
        if(cue.text != "") {
          cue.text += "\n";
        }
        cue.text += lines[linePos];
        linePos++;
      }

      /* CUE TEXT PROCESSING */
      var cuetextparser = new WebVTTCueTextParser(cue.text, err, mode);
      cue.tree = cuetextparser.parse(cue.startTime, cue.endTime);
      cues.push(cue);
    }

    // Order by start time; for equal starts the cue that ends later goes first.
    cues.sort(function(a, b) {
      if (a.startTime < b.startTime) {
        return -1;
      }
      if (a.startTime > b.startTime) {
        return 1;
      }
      if (a.endTime > b.endTime) {
        return -1;
      }
      if (a.endTime < b.endTime) {
        return 1;
      }
      return 0;
    });

    /* END */
    return {cues:cues, errors:errors, time:Date.now()-startTime};
  };
};

/**
 * Parser for a single cue timing line ("start --> end settings...") and for
 * stand-alone timestamps.
 *
 * @constructor
 * @param {string}   line          The line to parse.
 * @param {function} errorHandler  Called as `errorHandler(message, column)`
 *                                 with a 1-based column for every problem.
 */
var WebVTTCueTimingsAndSettingsParser = function(line, errorHandler) {
  var SPACE = /[\u0020\t\f]/,
      NOSPACE = /[^\u0020\t\f]/,
      pos = 0,
      err = function(message) {
        errorHandler(message, pos+1);
      };

  // Advances past every consecutive character matching `pattern`.
  function skip(pattern) {
    while(
      line[pos] != undefined &&
      pattern.test(line[pos])
    ) {
      pos++;
    }
  }

  // Consumes and returns every consecutive character matching `pattern`.
  function collect(pattern) {
    var str = "";
    while(
      line[pos] != undefined &&
      pattern.test(line[pos])
    ) {
      str += line[pos];
      pos++;
    }
    return str;
  }

  /* http://dev.w3.org/html5/webvtt/#collect-a-webvtt-timestamp */
  // Returns the timestamp at `pos` in seconds, or undefined (after reporting
  // an error) when it is malformed. The numbered comments are spec steps.
  function timestamp() {
    var units = "minutes",
        val1,
        val2,
        val3,
        val4;
    // 3
    if(line[pos] == undefined) {
      err("No timestamp found.");
      return;
    }
    // 4
    if(!/\d/.test(line[pos])) {
      err("Timestamp must start with a character in the range 0-9.");
      return;
    }
    // 5-7
    val1 = collect(/\d/);
    if(val1.length > 2 || parseInt(val1, 10) > 59) {
      units = "hours";
    }
    // 8
    if(line[pos] != ":") {
      err("No time unit separator found.");
      return;
    }
    pos++;
    // 9-11
    val2 = collect(/\d/);
    if(val2.length != 2) {
      err("Must be exactly two digits.");
      return;
    }
    // 12
    if(units == "hours" || line[pos] == ":") {
      if(line[pos] != ":") {
        err("No seconds found or minutes is greater than 59.");
        return;
      }
      pos++;
      val3 = collect(/\d/);
      if(val3.length != 2) {
        err("Must be exactly two digits.");
        return;
      }
    } else {
      // "mm:ss.ttt" form: shift the collected fields and assume zero hours.
      val3 = val2;
      val2 = val1;
      val1 = "0";
    }
    // 13
    if(line[pos] != ".") {
      err("No decimal separator (\".\") found.");
      return;
    }
    pos++;
    // 14-16
    val4 = collect(/\d/);
    if(val4.length != 3) {
      err("Milliseconds must be given in three digits.");
      return;
    }
    // 17
    if(parseInt(val2, 10) > 59) {
      err("You cannot have more than 59 minutes.");
      return;
    }
    if(parseInt(val3, 10) > 59) {
      err("You cannot have more than 59 seconds.");
      return;
    }
    return parseInt(val1, 10) * 60 * 60 + parseInt(val2, 10) * 60 + parseInt(val3, 10) + parseInt(val4, 10) / 1000;
  }

  /* http://dev.w3.org/html5/webvtt/#parse-the-webvtt-settings */
  // Applies every valid "name:value" setting in `input` to `cue`; invalid
  // settings are reported and skipped.
  function parseSettings(input, cue) {
    var settings = input.split(SPACE),
        seen = [];
    for(var i=0; i < settings.length; i++) {
      if(settings[i] == "") {
        continue;
      }

      // `value` is declared here; it previously leaked as an implicit global.
      var index = settings[i].indexOf(':'),
          setting = settings[i].slice(0, index),
          value = settings[i].slice(index + 1);

      if(seen.indexOf(setting) != -1) {
        err("Duplicate setting.");
      }
      seen.push(setting);

      if(value == "") {
        err("No value for setting defined.");
        return;
      }

      if(setting == "vertical") { // writing direction
        if(value != "rl" && value != "lr") {
          err("Writing direction can only be set to 'rl' or 'lr'.");
          continue;
        }
        cue.direction = value;
      } else if(setting == "line") { // line position
        if(!/\d/.test(value)) {
          err("Line position takes a number or percentage.");
          continue;
        }
        if(value.indexOf("-", 1) != -1) {
          err("Line position can only have '-' at the start.");
          continue;
        }
        if(value.indexOf("%") != -1 && value.indexOf("%") != value.length-1) {
          err("Line position can only have '%' at the end.");
          continue;
        }
        if(value[0] == "-" && value[value.length-1] == "%") {
          err("Line position cannot be a negative percentage.");
          continue;
        }
        if(value[value.length-1] == "%") {
          if(parseInt(value, 10) > 100) {
            err("Line position cannot be >100%.");
            continue;
          }
          cue.snapToLines = false;
        }
        cue.linePosition = parseInt(value, 10);
      } else if(setting == "position") { // text position
        if(value[value.length-1] != "%") {
          err("Text position must be a percentage.");
          continue;
        }
        if(parseInt(value, 10) > 100) {
          err("Position cannot be >100%.");
          continue;
        }
        cue.textPosition = parseInt(value, 10);
      } else if(setting == "size") { // size
        if(value[value.length-1] != "%") {
          err("Size must be a percentage.");
          continue;
        }
        if(parseInt(value, 10) > 100) {
          err("Size cannot be >100%.");
          continue;
        }
        cue.size = parseInt(value, 10);
      } else if(setting == "align") { // alignment
        var alignValues = ["start", "middle", "end", "left", "right"];
        if(alignValues.indexOf(value) == -1) {
          err("Alignment can only be set to one of " + alignValues.join(", ") + ".");
          continue;
        }
        cue.alignment = value;
      } else {
        err("Invalid setting.");
      }
    }
  }

  /**
   * Parses the timing line into `cue` (startTime, endTime and settings).
   *
   * @param {Object} cue               Cue object to fill in.
   * @param {number} previousCueStart  Start time of the previous cue, used
   *                                   only to report out-of-order cues.
   * @returns {boolean|undefined} true when both timestamps and the "-->"
   *          separator were read; undefined when the line is unusable.
   */
  this.parse = function(cue, previousCueStart) {
    skip(SPACE);
    cue.startTime = timestamp();
    if(cue.startTime == undefined) {
      return;
    }
    if(cue.startTime < previousCueStart) {
      err("Start timestamp is not greater than or equal to start timestamp of previous cue.");
    }
    // Guard against end of line: RegExp#test would otherwise coerce
    // undefined to the string "undefined" and report a spurious error.
    if(line[pos] != undefined && NOSPACE.test(line[pos])) {
      err("Timestamp not separated from '-->' by whitespace.");
    }
    skip(SPACE);
    // 6-8
    if(line[pos] != "-") {
      err("No valid timestamp separator found.");
      return;
    }
    pos++;
    if(line[pos] != "-") {
      err("No valid timestamp separator found.");
      return;
    }
    pos++;
    if(line[pos] != ">") {
      err("No valid timestamp separator found.");
      return;
    }
    pos++;
    if(line[pos] != undefined && NOSPACE.test(line[pos])) {
      err("'-->' not separated from timestamp by whitespace.");
    }
    skip(SPACE);
    cue.endTime = timestamp();
    if(cue.endTime == undefined) {
      return;
    }
    if(cue.endTime <= cue.startTime) {
      err("End timestamp is not greater than start timestamp.");
    }
    skip(SPACE);
    parseSettings(line.substring(pos), cue);
    return true;
  };

  /**
   * Parses the whole line as one timestamp.
   *
   * @returns {number|undefined} Seconds, or undefined when the timestamp is
   *          malformed or followed by extra characters.
   */
  this.parseTimestamp = function() {
    var ts = timestamp();
    if(line[pos] != undefined) {
      err("Timestamp must not have trailing characters.");
      return;
    }
    return ts;
  };
};

/**
 * Parser turning cue text into a tree of text, tag and timestamp nodes.
 *
 * @constructor
 * @param {string}   line          The cue text (lines joined with "\n").
 * @param {function} errorHandler  Called as `errorHandler(message, column)`.
 * @param {string}   mode          "metadata" silences all cue text errors;
 *                                 "chapters" reports tags and timestamps.
 */
var WebVTTCueTextParser = function(line, errorHandler, mode) {
  var pos = 0,
      err = function(message) {
        // Metadata cues carry arbitrary payloads, so nothing is reported.
        if(mode == "metadata") {
          return;
        }
        errorHandler(message, pos+1);
      };

  /**
   * Builds the node tree for the cue text.
   *
   * @param {number} cueStart Cue start time; bounds inline timestamps.
   * @param {number} cueEnd   Cue end time; bounds inline timestamps.
   * @returns {{children: Array}} Root node. Child nodes carry a `parent`
   *          reference, so the tree is cyclic.
   */
  this.parse = function(cueStart, cueEnd) {
    var result = {children:[]},
        current = result,
        timestamps = [];

    // Appends an element node for `token` and descends into it.
    function attach(token) {
      current.children.push({type:"object", name:token[1], classes:token[2], children:[], parent:current});
      current = current.children[current.children.length-1];
    }

    // True when `current` or one of its ancestors is named `name`.
    function inScope(name) {
      var node = current;
      while(node) {
        if(node.name == name) {
          return true;
        }
        node = node.parent;
      }
      return;
    }

    while(line[pos] != undefined) {
      var token = nextToken();
      if(token[0] == "text") {
        current.children.push({type:"text", value:token[1], parent:current});
      } else if(token[0] == "start tag") {
        if(mode == "chapters") {
          err("Start tags not allowed in chapter title text.");
        }
        var name = token[1];
        if(name != "v" && name != "lang" && token[3] != "") {
          err("Only <v> and <lang> can have an annotation.");
        }
        if(
          name == "c" ||
          name == "i" ||
          name == "b" ||
          name == "u" ||
          name == "ruby"
        ) {
          attach(token);
        } else if(name == "rt" && current.name == "ruby") {
          attach(token);
        } else if(name == "v") {
          if(inScope("v")) {
            err("<v> cannot be nested inside itself.");
          }
          attach(token);
          current.value = token[3]; // annotation
          if(!token[3]) {
            err("<v> requires an annotation.");
          }
        } else if(name == "lang") {
          attach(token);
          current.value = token[3]; // language
        } else {
          err("Incorrect start tag.");
        }
      } else if(token[0] == "end tag") {
        if(mode == "chapters") {
          err("End tags not allowed in chapter title text.");
        }
        // XXX check <ruby> content
        if(token[1] == current.name) {
          current = current.parent;
        } else if(token[1] == "ruby" && current.name == "rt") {
          // </ruby> implicitly closes an open <rt> as well.
          current = current.parent.parent;
        } else {
          err("Incorrect end tag.");
        }
      } else if(token[0] == "timestamp") {
        if(mode == "chapters") {
          err("Timestamp not allowed in chapter title text.");
        }
        var timings = new WebVTTCueTimingsAndSettingsParser(token[1], err),
            timestamp = timings.parseTimestamp();
        if(timestamp != undefined) {
          if(timestamp <= cueStart || timestamp >= cueEnd) {
            err("Timestamp must be between start timestamp and end timestamp.");
          }
          if(timestamps.length > 0 && timestamps[timestamps.length-1] >= timestamp) {
            err("Timestamp must be greater than any previous timestamp.");
          }
          current.children.push({type:"timestamp", value:timestamp, parent:current});
          timestamps.push(timestamp);
        }
      }
    }
    // Unwind whatever is still open; only <v> may be left unclosed.
    while(current.parent) {
      if(current.name != "v") {
        err("Required end tag missing.");
      }
      current = current.parent;
    }
    return result;
  };

  // Tokenizer state machine. Returns one of:
  //   ["text", value]
  //   ["start tag", name, classes, annotation]
  //   ["end tag", name]
  //   ["timestamp", value]
  function nextToken() {
    var state = "data",
        result = "",
        buffer = "",
        classes = [];
    while(line[pos-1] != undefined || pos == 0) {
      var c = line[pos];
      if(state == "data") {
        if(c == "&") {
          buffer = c;
          state = "escape";
        } else if(c == "<" && result == "") {
          state = "tag";
        } else if(c == "<" || c == undefined) {
          return ["text", result];
        } else {
          result += c;
        }
      } else if(state == "escape") {
        if(c == "&") {
          err("Incorrect escape.");
          result += buffer;
          buffer = c;
        } else if(c != undefined && /[abglmnsprt]/.test(c)) {
          // The undefined guard matters: RegExp#test would coerce it to the
          // string "undefined", which matches and would make this function
          // run off the end of the input without returning a token.
          buffer += c;
        } else if(c == ";") {
          if(buffer == "&amp") {
            result += "&";
          } else if(buffer == "&lt") {
            result += "<";
          } else if(buffer == "&gt") {
            result += ">";
          } else if(buffer == "&lrm") {
            result += "\u200e";
          } else if(buffer == "&rlm") {
            result += "\u200f";
          } else if(buffer == "&nbsp") {
            result += "\u00A0";
          } else {
            err("Incorrect escape.");
            result += buffer + ";";
          }
          state = "data";
        } else if(c == "<" || c == undefined) {
          err("Incorrect escape.");
          result += buffer;
          return ["text", result];
        } else {
          err("Incorrect escape.");
          result += buffer + c;
          state = "data";
        }
      } else if(state == "tag") {
        if(c == "\t" || c == "\n" || c == "\f" || c == " ") {
          state = "start tag annotation";
        } else if(c == ".") {
          state = "start tag class";
        } else if(c == "/") {
          state = "end tag";
        } else if(/\d/.test(c)) {
          result = c;
          state = "timestamp tag";
        } else if(c == ">" || c == undefined) {
          if(c == ">") {
            pos++;
          }
          return ["start tag", "", [], ""];
        } else {
          result = c;
          state = "start tag";
        }
      } else if(state == "start tag") {
        if(c == "\t" || c == "\f" || c == " ") {
          state = "start tag annotation";
        } else if(c == "\n") {
          buffer = c;
          state = "start tag annotation";
        } else if(c == ".") {
          state = "start tag class";
        } else if(c == ">" || c == undefined) {
          if(c == ">") {
            pos++;
          }
          return ["start tag", result, [], ""];
        } else {
          result += c;
        }
      } else if(state == "start tag class") {
        if(c == "\t" || c == "\f" || c == " ") {
          classes.push(buffer);
          buffer = "";
          state = "start tag annotation";
        } else if(c == "\n") {
          classes.push(buffer);
          buffer = c;
          state = "start tag annotation";
        } else if(c == ".") {
          classes.push(buffer);
          buffer = "";
        } else if(c == ">" || c == undefined) {
          if(c == ">") {
            pos++;
          }
          classes.push(buffer);
          return ["start tag", result, classes, ""];
        } else {
          buffer += c;
        }
      } else if(state == "start tag annotation") {
        if(c == ">" || c == undefined) {
          if(c == ">") {
            pos++;
          }
          // Trim the annotation and collapse inner whitespace runs.
          buffer = buffer.split(/[\u0020\t\f\r\n]+/).filter(function(item) {
            return !!item;
          }).join(" ");
          return ["start tag", result, classes, buffer];
        } else {
          buffer += c;
        }
      } else if(state == "end tag") {
        if(c == ">" || c == undefined) {
          if(c == ">") {
            pos++;
          }
          return ["end tag", result];
        } else {
          result += c;
        }
      } else if(state == "timestamp tag") {
        if(c == ">" || c == undefined) {
          if(c == ">") {
            pos++;
          }
          return ["timestamp", result];
        } else {
          result += c;
        }
      } else {
        err("Never happens."); // The joke is it might.
      }
      // 8
      pos++;
    }
  }
};

/**
 * Serializer turning parsed cues back into text.
 *
 * @constructor
 */
var WebVTTSerializer = function() {
  // Serializes a list of tree nodes (text, element and timestamp nodes).
  function serializeTree(tree) {
    var result = "";
    for (var i = 0; i < tree.length; i++) {
      var node = tree[i];
      if(node.type == "text") {
        result += node.value;
      } else if(node.type == "object") {
        result += "<" + node.name;
        if(node.classes) {
          for(var y = 0; y < node.classes.length; y++) {
            result += "." + node.classes[y];
          }
        }
        if(node.value) {
          result += " " + node.value;
        }
        result += ">";
        if(node.children) {
          result += serializeTree(node.children);
        }
        result += "</" + node.name + ">";
      } else {
        result += "<" + node.value + ">";
      }
    }
    return result;
  }

  // One cue: "<start> <end>", a newline, the cue text and a blank line.
  function serializeCue(cue) {
    return cue.startTime + " " + cue.endTime + "\n" + serializeTree(cue.tree.children) + "\n\n";
  }

  /**
   * Serializes an array of parsed cues.
   *
   * @param {Array} cues Cues as returned by WebVTTParser#parse.
   * @returns {string} Concatenation of the serialized cues.
   */
  this.serialize = function(cues) {
    var result = "";
    // NOTE(review): the source was cut off at "for(var i=0;i"; the rest of
    // this loop is reconstructed as the obvious iteration over `cues`.
    // Confirm against the upstream file.
    for(var i = 0; i < cues.length; i++) {
      result += serializeCue(cues[i]);
    }
    return result;
  };
};