
src/remux/mp4-remuxer.ts

import AAC from './aac-helper';
import MP4 from './mp4-generator';
import type { HlsEventEmitter } from '../events';
import { Events } from '../events';
import { ErrorTypes, ErrorDetails } from '../errors';
import { logger } from '../utils/logger';
import {
  InitSegmentData,
  Remuxer,
  RemuxerResult,
  RemuxedMetadata,
  RemuxedTrack,
  RemuxedUserdata,
} from '../types/remuxer';
import { PlaylistLevelType } from '../types/loader';
import { toMsFromMpegTsClock } from '../utils/timescale-conversion';
import type {
  AudioSample,
  AvcSample,
  DemuxedAudioTrack,
  DemuxedAvcTrack,
  DemuxedMetadataTrack,
  DemuxedUserdataTrack,
} from '../types/demuxer';
import type { TrackSet } from '../types/track';
import type { SourceBufferName } from '../types/buffer';
import type { Fragment } from '../loader/fragment';
import type { HlsConfig } from '../config';

const MAX_SILENT_FRAME_DURATION = 10 * 1000; // 10 seconds
const AAC_SAMPLES_PER_FRAME = 1024;
const MPEG_AUDIO_SAMPLE_PER_FRAME = 1152;
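// Illustrative arithmetic (not part of the original source): at a 44100 Hz
// sample rate, one AAC frame spans 1024 / 44100 ≈ 23.2 ms and one MPEG audio
// frame spans 1152 / 44100 ≈ 26.1 ms. MAX_SILENT_FRAME_DURATION is expressed
// in milliseconds.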

let chromeVersion: number | null = null;
let safariWebkitVersion: number | null = null;
let requiresPositiveDts: boolean = false;

export default class MP4Remuxer implements Remuxer {
  private observer: HlsEventEmitter;
  private config: HlsConfig;
  private typeSupported: any;
  private ISGenerated: boolean = false;
  private _initPTS!: number;
  private _initDTS!: number;
  private nextAvcDts: number | null = null;
  private nextAudioPts: number | null = null;
  private isAudioContiguous: boolean = false;
  private isVideoContiguous: boolean = false;

  constructor(
    observer: HlsEventEmitter,
    config: HlsConfig,
    typeSupported,
    vendor = ''
  ) {
    this.observer = observer;
    this.config = config;
    this.typeSupported = typeSupported;
    this.ISGenerated = false;

    if (chromeVersion === null) {
      const userAgent = navigator.userAgent || '';
      const result = userAgent.match(/Chrome\/(\d+)/i);
      chromeVersion = result ? parseInt(result[1]) : 0;
    }
    if (safariWebkitVersion === null) {
      const result = navigator.userAgent.match(/Safari\/(\d+)/i);
      safariWebkitVersion = result ? parseInt(result[1]) : 0;
    }
    requiresPositiveDts =
      (!!chromeVersion && chromeVersion < 75) ||
      (!!safariWebkitVersion && safariWebkitVersion < 600);
  }

  destroy() {}

  resetTimeStamp(defaultTimeStamp) {
    logger.log('[mp4-remuxer]: initPTS & initDTS reset');
    this._initPTS = this._initDTS = defaultTimeStamp;
  }

  resetNextTimestamp() {
    logger.log('[mp4-remuxer]: reset next timestamp');
    this.isVideoContiguous = false;
    this.isAudioContiguous = false;
  }

  resetInitSegment() {
    logger.log('[mp4-remuxer]: ISGenerated flag reset');
    this.ISGenerated = false;
  }
  getVideoStartPts(videoSamples) {
    let rolloverDetected = false;
    const startPTS = videoSamples.reduce((minPTS, sample) => {
      const delta = sample.pts - minPTS;
      if (delta < -4294967296) {
        // 2^32: see normalizePts for the reasoning. We're hitting a rollover
        // here, and we don't want it to impact the timeOffset calculation.
        rolloverDetected = true;
        return normalizePts(minPTS, sample.pts);
      } else if (delta > 0) {
        return minPTS;
      } else {
        return sample.pts;
      }
    }, videoSamples[0].pts);
    if (rolloverDetected) {
      logger.debug('PTS rollover detected');
    }
    return startPTS;
  }
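  // Example (illustrative): for samples with PTS [90000, 89000, 91000], the
  // reduce above returns 89000, the smallest PTS; only a delta below -2^32 is
  // treated as a 33-bit rollover and re-normalized before comparison.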

  remux(
    audioTrack: DemuxedAudioTrack,
    videoTrack: DemuxedAvcTrack,
    id3Track: DemuxedMetadataTrack,
    textTrack: DemuxedUserdataTrack,
    timeOffset: number,
    accurateTimeOffset: boolean,
    flush: boolean,
    playlistType: PlaylistLevelType
  ): RemuxerResult {
    let video: RemuxedTrack | undefined;
    let audio: RemuxedTrack | undefined;
    let initSegment: InitSegmentData | undefined;
    let text: RemuxedUserdata | undefined;
    let id3: RemuxedMetadata | undefined;
    let independent: boolean | undefined;
    let audioTimeOffset = timeOffset;
    let videoTimeOffset = timeOffset;

    // If we're remuxing audio and video progressively, wait until we've received enough samples for each track before proceeding.
    // This is done to synchronize the audio and video streams. We know the current segment will have samples if the "pid"
    // parameter is greater than -1. The pid is set when the PMT is parsed, which contains the tracks list.
    // However, if the initSegment has already been generated, or we've reached the end of a segment (flush),
    // then we can remux one track without waiting for the other.
    const hasAudio = audioTrack.pid > -1;
    const hasVideo = videoTrack.pid > -1;
    const length = videoTrack.samples.length;
    const enoughAudioSamples = audioTrack.samples.length > 0;
    const enoughVideoSamples = length > 1;
    const canRemuxAvc =
      ((!hasAudio || enoughAudioSamples) &&
        (!hasVideo || enoughVideoSamples)) ||
      this.ISGenerated ||
      flush;

    if (canRemuxAvc) {
      if (!this.ISGenerated) {
        initSegment = this.generateIS(audioTrack, videoTrack, timeOffset);
      }

      const isVideoContiguous = this.isVideoContiguous;
      let firstKeyFrameIndex = -1;

      if (enoughVideoSamples) {
        firstKeyFrameIndex = findKeyframeIndex(videoTrack.samples);
        if (!isVideoContiguous && this.config.forceKeyFrameOnDiscontinuity) {
          independent = true;
          if (firstKeyFrameIndex > 0) {
            logger.warn(
              `[mp4-remuxer]: Dropped ${firstKeyFrameIndex} out of ${length} video samples due to a missing keyframe`
            );
            const startPTS = this.getVideoStartPts(videoTrack.samples);
            videoTrack.samples = videoTrack.samples.slice(firstKeyFrameIndex);
            videoTrack.dropped += firstKeyFrameIndex;
            videoTimeOffset +=
              (videoTrack.samples[0].pts - startPTS) /
              (videoTrack.timescale || 90000);
          } else if (firstKeyFrameIndex === -1) {
            logger.warn(
              `[mp4-remuxer]: No keyframe found out of ${length} video samples`
            );
            independent = false;
          }
        }
      }

      if (this.ISGenerated) {
        if (enoughAudioSamples && enoughVideoSamples) {
          // timeOffset is expected to be the offset of the first timestamp of this fragment (first DTS).
          // If the first audio DTS is not aligned with the first video DTS, we need to take that into account
          // when providing timeOffset to remuxAudio / remuxVideo. If we don't, there might be a small permanent
          // drift between the audio and video streams.
          const startPTS = this.getVideoStartPts(videoTrack.samples);
          const tsDelta =
            normalizePts(audioTrack.samples[0].pts, startPTS) - startPTS;
          const audiovideoTimestampDelta = tsDelta / videoTrack.inputTimeScale;
          audioTimeOffset += Math.max(0, audiovideoTimestampDelta);
          videoTimeOffset += Math.max(0, -audiovideoTimestampDelta);
        }
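        // Example (illustrative): if the first audio PTS leads the first
        // video PTS by 3000 ticks at an inputTimeScale of 90000,
        // audiovideoTimestampDelta ≈ 0.033, so audioTimeOffset advances by
        // ~33 ms while videoTimeOffset is unchanged (and vice versa when
        // video leads).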

        // Purposefully remuxing audio before video, so that remuxVideo can use nextAudioPts, which is calculated in remuxAudio.
        if (enoughAudioSamples) {
          // if initSegment was generated without audio samples, regenerate it again
          if (!audioTrack.samplerate) {
            logger.warn(
              '[mp4-remuxer]: regenerate InitSegment as audio detected'
            );
            initSegment = this.generateIS(audioTrack, videoTrack, timeOffset);
          }
          audio = this.remuxAudio(
            audioTrack,
            audioTimeOffset,
            this.isAudioContiguous,
            accurateTimeOffset,
            hasVideo ||
              enoughVideoSamples ||
              playlistType === PlaylistLevelType.AUDIO
              ? videoTimeOffset
              : undefined
          );
          if (enoughVideoSamples) {
            const audioTrackLength = audio ? audio.endPTS - audio.startPTS : 0;
            // if initSegment was generated without video samples, regenerate it again
            if (!videoTrack.inputTimeScale) {
              logger.warn(
                '[mp4-remuxer]: regenerate InitSegment as video detected'
              );
              initSegment = this.generateIS(audioTrack, videoTrack, timeOffset);
            }
            video = this.remuxVideo(
              videoTrack,
              videoTimeOffset,
              isVideoContiguous,
              audioTrackLength
            );
          }
        } else if (enoughVideoSamples) {
          video = this.remuxVideo(
            videoTrack,
            videoTimeOffset,
            isVideoContiguous,
            0
          );
        }
        if (video) {
          video.firstKeyFrame = firstKeyFrameIndex;
          video.independent = firstKeyFrameIndex !== -1;
        }
      }
    }

    // Allow ID3 and text to remux, even if more audio/video samples are required
    if (this.ISGenerated) {
      if (id3Track.samples.length) {
        id3 = flushTextTrackMetadataCueSamples(
          id3Track,
          timeOffset,
          this._initPTS,
          this._initDTS
        );
      }

      if (textTrack.samples.length) {
        text = flushTextTrackUserdataCueSamples(
          textTrack,
          timeOffset,
          this._initPTS
        );
      }
    }

    return {
      audio,
      video,
      initSegment,
      independent,
      text,
      id3,
    };
  }

  generateIS(
    audioTrack: DemuxedAudioTrack,
    videoTrack: DemuxedAvcTrack,
    timeOffset
  ): InitSegmentData | undefined {
    const audioSamples = audioTrack.samples;
    const videoSamples = videoTrack.samples;
    const typeSupported = this.typeSupported;
    const tracks: TrackSet = {};
    const computePTSDTS = !Number.isFinite(this._initPTS);
    let container = 'audio/mp4';
    let initPTS: number | undefined;
    let initDTS: number | undefined;
    let timescale: number | undefined;

    if (computePTSDTS) {
      initPTS = initDTS = Infinity;
    }

    if (audioTrack.config && audioSamples.length) {
      // Let's use the audio sampling rate as the MP4 time scale.
      // The rationale is that there is an integer number of audio samples per audio frame (1024 for AAC),
      // so using the audio sampling rate here yields an integer MP4 frame duration.
      // This avoids potential rounding issues and AV sync issues.
      audioTrack.timescale = audioTrack.samplerate;
      switch (audioTrack.segmentCodec) {
        case 'mp3':
          if (typeSupported.mpeg) {
            // Chrome and Safari
            container = 'audio/mpeg';
            audioTrack.codec = '';
          } else if (typeSupported.mp3) {
            // Firefox
            audioTrack.codec = 'mp3';
          }
          break;
      }
      tracks.audio = {
        id: 'audio',
        container: container,
        codec: audioTrack.codec,
        initSegment:
          audioTrack.segmentCodec === 'mp3' && typeSupported.mpeg
            ? new Uint8Array(0)
            : MP4.initSegment([audioTrack]),
        metadata: {
          channelCount: audioTrack.channelCount,
        },
      };
      if (computePTSDTS) {
        timescale = audioTrack.inputTimeScale;
        // remember the first PTS of this demuxing context; for audio, PTS = DTS
        initPTS = initDTS =
          audioSamples[0].pts - Math.round(timescale * timeOffset);
      }
    }

    if (videoTrack.sps && videoTrack.pps && videoSamples.length) {
      // Let's use the input time scale as the MP4 video timescale.
      // Using the input time scale directly avoids rounding issues in frame duration / cts computation.
      videoTrack.timescale = videoTrack.inputTimeScale;
      tracks.video = {
        id: 'main',
        container: 'video/mp4',
        codec: videoTrack.codec,
        initSegment: MP4.initSegment([videoTrack]),
        metadata: {
          width: videoTrack.width,
          height: videoTrack.height,
        },
      };
      if (computePTSDTS) {
        timescale = videoTrack.inputTimeScale;
        const startPTS = this.getVideoStartPts(videoSamples);
        const startOffset = Math.round(timescale * timeOffset);
        initDTS = Math.min(
          initDTS as number,
          normalizePts(videoSamples[0].dts, startPTS) - startOffset
        );
        initPTS = Math.min(initPTS as number, startPTS - startOffset);
      }
    }

    if (Object.keys(tracks).length) {
      this.ISGenerated = true;
      if (computePTSDTS) {
        this._initPTS = initPTS as number;
        this._initDTS = initDTS as number;
      }

      return {
        tracks,
        initPTS,
        timescale,
      };
    }
  }
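  // Worked example (illustrative): with timescale = 90000 and
  // timeOffset = 10 s, startOffset = 900000. A first audio sample with
  // pts = 1800000 yields initPTS = 900000, so subtracting initPTS later
  // rebases this fragment's timestamps to start at 10 s.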
  371.  
  372. remuxVideo(
  373. track: DemuxedAvcTrack,
  374. timeOffset: number,
  375. contiguous: boolean,
  376. audioTrackLength: number
  377. ): RemuxedTrack | undefined {
  378. const timeScale: number = track.inputTimeScale;
  379. const inputSamples: Array<AvcSample> = track.samples;
  380. const outputSamples: Array<Mp4Sample> = [];
  381. const nbSamples: number = inputSamples.length;
  382. const initPTS: number = this._initPTS;
  383. let nextAvcDts = this.nextAvcDts;
  384. let offset = 8;
  385. let mp4SampleDuration!: number;
  386. let firstDTS;
  387. let lastDTS;
  388. let minPTS: number = Number.POSITIVE_INFINITY;
  389. let maxPTS: number = Number.NEGATIVE_INFINITY;
  390. let ptsDtsShift = 0;
  391. let sortSamples = false;
  392.  
  393. // if parsed fragment is contiguous with last one, let's use last DTS value as reference
  394. if (!contiguous || nextAvcDts === null) {
  395. const pts = timeOffset * timeScale;
  396. const cts =
  397. inputSamples[0].pts -
  398. normalizePts(inputSamples[0].dts, inputSamples[0].pts);
  399. // if not contiguous, let's use target timeOffset
  400. nextAvcDts = pts - cts;
  401. }
  402.  
  403. // PTS is coded on 33bits, and can loop from -2^32 to 2^32
  404. // PTSNormalize will make PTS/DTS value monotonic, we use last known DTS value as reference value
  405. for (let i = 0; i < nbSamples; i++) {
  406. const sample = inputSamples[i];
  407. sample.pts = normalizePts(sample.pts - initPTS, nextAvcDts);
  408. sample.dts = normalizePts(sample.dts - initPTS, nextAvcDts);
  409. if (sample.dts > sample.pts) {
  410. const PTS_DTS_SHIFT_TOLERANCE_90KHZ = 90000 * 0.2;
  411. ptsDtsShift = Math.max(
  412. Math.min(ptsDtsShift, sample.pts - sample.dts),
  413. -1 * PTS_DTS_SHIFT_TOLERANCE_90KHZ
  414. );
  415. }
  416. if (sample.dts < inputSamples[i > 0 ? i - 1 : i].dts) {
  417. sortSamples = true;
  418. }
  419. }
  420.  
  421. // sort video samples by DTS then PTS then demux id order
  422. if (sortSamples) {
  423. inputSamples.sort(function (a, b) {
  424. const deltadts = a.dts - b.dts;
  425. const deltapts = a.pts - b.pts;
  426. return deltadts || deltapts;
  427. });
  428. }
  429.  
  430. // Get first/last DTS
  431. firstDTS = inputSamples[0].dts;
  432. lastDTS = inputSamples[inputSamples.length - 1].dts;
  433.  
  434. // on Safari let's signal the same sample duration for all samples
  435. // sample duration (as expected by trun MP4 boxes), should be the delta between sample DTS
  436. // set this constant duration as being the avg delta between consecutive DTS.
  437. const averageSampleDuration = Math.round(
  438. (lastDTS - firstDTS) / (nbSamples - 1)
  439. );
  440.  
  441. // handle broken streams with PTS < DTS, tolerance up 0.2 seconds
  442. if (ptsDtsShift < 0) {
  443. if (ptsDtsShift < averageSampleDuration * -2) {
  444. // Fix for "CNN special report, with CC" in test-streams (including Safari browser)
  445. // With large PTS < DTS errors such as this, we want to correct CTS while maintaining increasing DTS values
  446. logger.warn(
  447. `PTS < DTS detected in video samples, offsetting DTS from PTS by ${toMsFromMpegTsClock(
  448. -averageSampleDuration,
  449. true
  450. )} ms`
  451. );
  452. let lastDts = ptsDtsShift;
  453. for (let i = 0; i < nbSamples; i++) {
  454. inputSamples[i].dts = lastDts = Math.max(
  455. lastDts,
  456. inputSamples[i].pts - averageSampleDuration
  457. );
  458. inputSamples[i].pts = Math.max(lastDts, inputSamples[i].pts);
  459. }
  460. } else {
  461. // Fix for "Custom IV with bad PTS DTS" in test-streams
  462. // With smaller PTS < DTS errors we can simply move all DTS back. This increases CTS without causing buffer gaps or decode errors in Safari
  463. logger.warn(
  464. `PTS < DTS detected in video samples, shifting DTS by ${toMsFromMpegTsClock(
  465. ptsDtsShift,
  466. true
  467. )} ms to overcome this issue`
  468. );
  469. for (let i = 0; i < nbSamples; i++) {
  470. inputSamples[i].dts = inputSamples[i].dts + ptsDtsShift;
  471. }
  472. }
  473. firstDTS = inputSamples[0].dts;
  474. }
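    // Example (illustrative): with averageSampleDuration = 3000 (≈33 ms at
    // 90 kHz), a ptsDtsShift of -9000 exceeds -2 * 3000, so DTS is rebuilt
    // from PTS minus one average duration; a milder shift of -4000 would
    // instead move every DTS back by 4000 ticks.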

    // if fragments are contiguous, detect a hole/overlap between them
    if (contiguous) {
      // check timestamp continuity across consecutive fragments (this is to remove inter-fragment gaps/holes)
      const delta = firstDTS - nextAvcDts;
      const foundHole = delta > averageSampleDuration;
      const foundOverlap = delta < -1;
      if (foundHole || foundOverlap) {
        if (foundHole) {
          logger.warn(
            `AVC: ${toMsFromMpegTsClock(
              delta,
              true
            )} ms (${delta}dts) hole between fragments detected, filling it`
          );
        } else {
          logger.warn(
            `AVC: ${toMsFromMpegTsClock(
              -delta,
              true
            )} ms (${delta}dts) overlapping between fragments detected`
          );
        }
        firstDTS = nextAvcDts;
        const firstPTS = inputSamples[0].pts - delta;
        inputSamples[0].dts = firstDTS;
        inputSamples[0].pts = firstPTS;
        logger.log(
          `Video: First PTS/DTS adjusted: ${toMsFromMpegTsClock(
            firstPTS,
            true
          )}/${toMsFromMpegTsClock(
            firstDTS,
            true
          )}, delta: ${toMsFromMpegTsClock(delta, true)} ms`
        );
      }
    }

    if (requiresPositiveDts) {
      firstDTS = Math.max(0, firstDTS);
    }
    let nbNalu = 0;
    let naluLen = 0;
    for (let i = 0; i < nbSamples; i++) {
      // compute the total AVC sample length and the number of NAL units
      const sample = inputSamples[i];
      const units = sample.units;
      const nbUnits = units.length;
      let sampleLen = 0;
      for (let j = 0; j < nbUnits; j++) {
        sampleLen += units[j].data.length;
      }

      naluLen += sampleLen;
      nbNalu += nbUnits;
      sample.length = sampleLen;

      // normalize PTS/DTS
      // ensure sample-monotonic DTS
      sample.dts = Math.max(sample.dts, firstDTS);
      // ensure that the computed PTS is greater than or equal to the sample DTS
      sample.pts = Math.max(sample.pts, sample.dts, 0);
      minPTS = Math.min(sample.pts, minPTS);
      maxPTS = Math.max(sample.pts, maxPTS);
    }
    lastDTS = inputSamples[nbSamples - 1].dts;

    /* concatenate the video data and construct the mdat in place
       (need 8 more bytes to fill length and mdat type) */
    const mdatSize = naluLen + 4 * nbNalu + 8;
    let mdat;
    try {
      mdat = new Uint8Array(mdatSize);
    } catch (err) {
      this.observer.emit(Events.ERROR, Events.ERROR, {
        type: ErrorTypes.MUX_ERROR,
        details: ErrorDetails.REMUX_ALLOC_ERROR,
        fatal: false,
        bytes: mdatSize,
        reason: `fail allocating video mdat ${mdatSize}`,
      });
      return;
    }
    const view = new DataView(mdat.buffer);
    view.setUint32(0, mdatSize);
    mdat.set(MP4.types.mdat, 4);

    for (let i = 0; i < nbSamples; i++) {
      const avcSample = inputSamples[i];
      const avcSampleUnits = avcSample.units;
      let mp4SampleLength = 0;
      // convert the NALU bitstream to MP4 format (prepend each NALU with its size field)
      for (let j = 0, nbUnits = avcSampleUnits.length; j < nbUnits; j++) {
        const unit = avcSampleUnits[j];
        const unitData = unit.data;
        const unitDataLen = unit.data.byteLength;
        view.setUint32(offset, unitDataLen);
        offset += 4;
        mdat.set(unitData, offset);
        offset += unitDataLen;
        mp4SampleLength += 4 + unitDataLen;
      }

      // the expected sample duration is the decoding timestamp diff of consecutive samples
      if (i < nbSamples - 1) {
        mp4SampleDuration = inputSamples[i + 1].dts - avcSample.dts;
      } else {
        const config = this.config;
        const lastFrameDuration =
          avcSample.dts - inputSamples[i > 0 ? i - 1 : i].dts;
        if (config.stretchShortVideoTrack && this.nextAudioPts !== null) {
          // In some cases, a segment's audio track duration may exceed the video track duration.
          // Since we've already remuxed audio, and we know how long the audio track is, we look to
          // see if the delta to the next segment is longer than maxBufferHole.
          // If so, playback would potentially get stuck, so we artificially inflate
          // the duration of the last frame to minimize any potential gap between segments.
          const gapTolerance = Math.floor(config.maxBufferHole * timeScale);
          const deltaToFrameEnd =
            (audioTrackLength
              ? minPTS + audioTrackLength * timeScale
              : this.nextAudioPts) - avcSample.pts;
          if (deltaToFrameEnd > gapTolerance) {
            // We subtract lastFrameDuration from deltaToFrameEnd to try to prevent any video
            // frame overlap. maxBufferHole should be >> lastFrameDuration anyway.
            mp4SampleDuration = deltaToFrameEnd - lastFrameDuration;
            if (mp4SampleDuration < 0) {
              mp4SampleDuration = lastFrameDuration;
            }
            logger.log(
              `[mp4-remuxer]: It is approximately ${
                deltaToFrameEnd / 90
              } ms to the next segment; using duration ${
                mp4SampleDuration / 90
              } ms for the last video frame.`
            );
          } else {
            mp4SampleDuration = lastFrameDuration;
          }
        } else {
          mp4SampleDuration = lastFrameDuration;
        }
      }
      const compositionTimeOffset = Math.round(avcSample.pts - avcSample.dts);

      outputSamples.push(
        new Mp4Sample(
          avcSample.key,
          mp4SampleDuration,
          mp4SampleLength,
          compositionTimeOffset
        )
      );
    }

    if (outputSamples.length && chromeVersion && chromeVersion < 70) {
      // Chrome workaround: mark the first sample as a Random Access Point (keyframe) to avoid a sourcebuffer append issue
      // https://code.google.com/p/chromium/issues/detail?id=229412
      const flags = outputSamples[0].flags;
      flags.dependsOn = 2;
      flags.isNonSync = 0;
    }

    console.assert(
      mp4SampleDuration !== undefined,
      'mp4SampleDuration must be computed'
    );
    // the next AVC sample DTS should be equal to the last sample DTS + last sample duration (in PES timescale)
    this.nextAvcDts = nextAvcDts = lastDTS + mp4SampleDuration;
    this.isVideoContiguous = true;
    const moof = MP4.moof(
      track.sequenceNumber++,
      firstDTS,
      Object.assign({}, track, {
        samples: outputSamples,
      })
    );
    const type: SourceBufferName = 'video';
    const data = {
      data1: moof,
      data2: mdat,
      startPTS: minPTS / timeScale,
      endPTS: (maxPTS + mp4SampleDuration) / timeScale,
      startDTS: firstDTS / timeScale,
      endDTS: (nextAvcDts as number) / timeScale,
      type,
      hasAudio: false,
      hasVideo: true,
      nb: outputSamples.length,
      dropped: track.dropped,
    };

    track.samples = [];
    track.dropped = 0;

    console.assert(mdat.length, 'MDAT length must not be zero');

    return data;
  }
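  // mdat layout produced above (for reference): [4-byte box size]
  // ['mdat' type][per NALU: 4-byte length + NALU bytes], which is why
  // mdatSize = naluLen + 4 * nbNalu + 8.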
  674.  
  675. remuxAudio(
  676. track: DemuxedAudioTrack,
  677. timeOffset: number,
  678. contiguous: boolean,
  679. accurateTimeOffset: boolean,
  680. videoTimeOffset?: number
  681. ): RemuxedTrack | undefined {
  682. const inputTimeScale: number = track.inputTimeScale;
  683. const mp4timeScale: number = track.samplerate
  684. ? track.samplerate
  685. : inputTimeScale;
  686. const scaleFactor: number = inputTimeScale / mp4timeScale;
  687. const mp4SampleDuration: number =
  688. track.segmentCodec === 'aac'
  689. ? AAC_SAMPLES_PER_FRAME
  690. : MPEG_AUDIO_SAMPLE_PER_FRAME;
  691. const inputSampleDuration: number = mp4SampleDuration * scaleFactor;
  692. const initPTS: number = this._initPTS;
  693. const rawMPEG: boolean =
  694. track.segmentCodec === 'mp3' && this.typeSupported.mpeg;
  695. const outputSamples: Array<Mp4Sample> = [];
  696.  
  697. let inputSamples: Array<AudioSample> = track.samples;
  698. let offset: number = rawMPEG ? 0 : 8;
  699. let nextAudioPts: number = this.nextAudioPts || -1;
  700.  
  701. // window.audioSamples ? window.audioSamples.push(inputSamples.map(s => s.pts)) : (window.audioSamples = [inputSamples.map(s => s.pts)]);
  702.  
  703. // for audio samples, also consider consecutive fragments as being contiguous (even if a level switch occurs),
  704. // for sake of clarity:
  705. // consecutive fragments are frags with
  706. // - less than 100ms gaps between new time offset (if accurate) and next expected PTS OR
  707. // - less than 20 audio frames distance
  708. // contiguous fragments are consecutive fragments from same quality level (same level, new SN = old SN + 1)
  709. // this helps ensuring audio continuity
  710. // and this also avoids audio glitches/cut when switching quality, or reporting wrong duration on first audio frame
  711. const timeOffsetMpegTS = timeOffset * inputTimeScale;
  712. this.isAudioContiguous = contiguous =
  713. contiguous ||
  714. ((inputSamples.length &&
  715. nextAudioPts > 0 &&
  716. ((accurateTimeOffset &&
  717. Math.abs(timeOffsetMpegTS - nextAudioPts) < 9000) ||
  718. Math.abs(
  719. normalizePts(inputSamples[0].pts - initPTS, timeOffsetMpegTS) -
  720. nextAudioPts
  721. ) <
  722. 20 * inputSampleDuration)) as boolean);
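    // Example (illustrative): at inputTimeScale = 90000 the 9000-tick
    // tolerance above is 100 ms; for AAC at 44100 Hz,
    // 20 * inputSampleDuration ≈ 20 * 1024 * (90000 / 44100) ≈ 41800 ticks,
    // roughly 0.46 s of audio.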

    // compute normalized PTS
    inputSamples.forEach(function (sample) {
      sample.pts = normalizePts(sample.pts - initPTS, timeOffsetMpegTS);
    });

    if (!contiguous || nextAudioPts < 0) {
      // Filter out samples with negative PTS that are not playable anyway.
      // If we don't remove these negative samples, they will shift all audio samples forward,
      // leading to audio overlap between the current and next fragment.
      inputSamples = inputSamples.filter((sample) => sample.pts >= 0);

      // in case all samples have negative PTS and have been filtered out, return now
      if (!inputSamples.length) {
        return;
      }

      if (videoTimeOffset === 0) {
        // Set the start to 0 to match video so that start gaps larger than inputSampleDuration are filled with silence
        nextAudioPts = 0;
      } else if (accurateTimeOffset) {
        // When not seeking, not live, and LevelDetails.PTSKnown, use the fragment start as the predicted next audio PTS
        nextAudioPts = Math.max(0, timeOffsetMpegTS);
      } else {
        // if frags are not contiguous and we can't trust the time offset, let's use the first sample PTS as the next audio PTS
        nextAudioPts = inputSamples[0].pts;
      }
    }

    // If the audio track is missing samples, the frames seem to get "left-shifted" within the
    // resulting mp4 segment, causing sync issues and leaving gaps at the end of the audio segment.
    // In an effort to prevent this from happening, we inject frames here where there are gaps.
    // When possible, we inject a silent frame; when that's not possible, we duplicate the last
    // frame.

    if (track.segmentCodec === 'aac') {
      const alignedWithVideo = videoTimeOffset !== undefined;
      const maxAudioFramesDrift = this.config.maxAudioFramesDrift;
      for (let i = 0, nextPts = nextAudioPts; i < inputSamples.length; i++) {
        // First, let's see how far off this frame is from where we expect it to be
        const sample = inputSamples[i];
        const pts = sample.pts;
        const delta = pts - nextPts;
        const duration = Math.abs((1000 * delta) / inputTimeScale);

        // When remuxing with video, if we're overlapping by more than a duration, drop this sample to stay in sync
        if (
          delta <= -maxAudioFramesDrift * inputSampleDuration &&
          alignedWithVideo
        ) {
          if (i === 0) {
            logger.warn(
              `Audio frame @ ${(pts / inputTimeScale).toFixed(
                3
              )}s overlaps nextAudioPts by ${Math.round(
                (1000 * delta) / inputTimeScale
              )} ms.`
            );
            this.nextAudioPts = nextAudioPts = nextPts = pts;
          }
        } // eslint-disable-line brace-style

        // Insert missing frames if:
        // 1: We're more than maxAudioFramesDrift frames away
        // 2: Not more than MAX_SILENT_FRAME_DURATION away
        // 3: currentTime (aka nextPtsNorm) is not 0
        // 4: remuxing with video (videoTimeOffset !== undefined)
        else if (
          delta >= maxAudioFramesDrift * inputSampleDuration &&
          duration < MAX_SILENT_FRAME_DURATION &&
          alignedWithVideo
        ) {
          let missing = Math.round(delta / inputSampleDuration);
          // Adjust nextPts so that silent samples are aligned with media pts. This will prevent media samples from
          // later being shifted if nextPts is based on timeOffset and delta is not a multiple of inputSampleDuration.
          nextPts = pts - missing * inputSampleDuration;
          if (nextPts < 0) {
            missing--;
            nextPts += inputSampleDuration;
          }
          if (i === 0) {
            this.nextAudioPts = nextAudioPts = nextPts;
          }
          logger.warn(
            `[mp4-remuxer]: Injecting ${missing} audio frame @ ${(
              nextPts / inputTimeScale
            ).toFixed(3)}s due to ${Math.round(
              (1000 * delta) / inputTimeScale
            )} ms gap.`
          );
          for (let j = 0; j < missing; j++) {
            const newStamp = Math.max(nextPts as number, 0);
            let fillFrame = AAC.getSilentFrame(
              track.manifestCodec || track.codec,
              track.channelCount
            );
            if (!fillFrame) {
              logger.log(
                '[mp4-remuxer]: Unable to get silent frame for given audio codec; duplicating last frame instead.'
              );
              fillFrame = sample.unit.subarray();
            }
            inputSamples.splice(i, 0, {
              unit: fillFrame,
              pts: newStamp,
            });
            nextPts += inputSampleDuration;
            i++;
          }
        }
        sample.pts = nextPts;
        nextPts += inputSampleDuration;
      }
    }
    let firstPTS: number | null = null;
    let lastPTS: number | null = null;
    let mdat: any;
    let mdatSize: number = 0;
    let sampleLength: number = inputSamples.length;
    while (sampleLength--) {
      mdatSize += inputSamples[sampleLength].unit.byteLength;
    }
    for (let j = 0, nbSamples = inputSamples.length; j < nbSamples; j++) {
      const audioSample = inputSamples[j];
      const unit = audioSample.unit;
      let pts = audioSample.pts;
      if (lastPTS !== null) {
        // If we have more than one sample, set the duration of the previous sample to the "real" duration:
        // the PTS diff with the current sample
        const prevSample = outputSamples[j - 1];
        prevSample.duration = Math.round((pts - lastPTS) / scaleFactor);
      } else {
        if (contiguous && track.segmentCodec === 'aac') {
          // set PTS/DTS to the expected PTS/DTS
          pts = nextAudioPts;
        }
        // remember the first PTS of our audioSamples
        firstPTS = pts;
        if (mdatSize > 0) {
          /* concatenate the audio data and construct the mdat in place
             (need 8 more bytes to fill length and mdat type) */
          mdatSize += offset;
          try {
            mdat = new Uint8Array(mdatSize);
          } catch (err) {
            this.observer.emit(Events.ERROR, Events.ERROR, {
              type: ErrorTypes.MUX_ERROR,
              details: ErrorDetails.REMUX_ALLOC_ERROR,
              fatal: false,
              bytes: mdatSize,
              reason: `fail allocating audio mdat ${mdatSize}`,
            });
            return;
          }
          if (!rawMPEG) {
            const view = new DataView(mdat.buffer);
            view.setUint32(0, mdatSize);
            mdat.set(MP4.types.mdat, 4);
          }
        } else {
          // no audio samples
          return;
        }
      }
      mdat.set(unit, offset);
      const unitLen = unit.byteLength;
      offset += unitLen;
      // Default the sample's duration to the computed mp4SampleDuration, which will either be 1024 for AAC or 1152 for MPEG audio.
      // In the case that we have one sample, this will be its duration. If we have more than one sample, the duration
      // becomes the PTS diff with the next sample.
      outputSamples.push(new Mp4Sample(true, mp4SampleDuration, unitLen, 0));
      lastPTS = pts;
    }

    // We could end up with no audio samples if all input samples were overlapping with the previously remuxed ones
    const nbSamples = outputSamples.length;
    if (!nbSamples) {
      return;
    }

    // The next audio sample PTS should be equal to the last sample PTS + duration
    const lastSample = outputSamples[outputSamples.length - 1];
    this.nextAudioPts = nextAudioPts =
      lastPTS! + scaleFactor * lastSample.duration;

    // Set the track samples from inputSamples to outputSamples before remuxing
    const moof = rawMPEG
      ? new Uint8Array(0)
      : MP4.moof(
          track.sequenceNumber++,
          firstPTS! / scaleFactor,
          Object.assign({}, track, { samples: outputSamples })
        );

    // Clear the track samples. This also clears the samples array in the demuxer, since the reference is shared
    track.samples = [];
    const start = firstPTS! / inputTimeScale;
    const end = nextAudioPts / inputTimeScale;
    const type: SourceBufferName = 'audio';
    const audioData = {
      data1: moof,
      data2: mdat,
      startPTS: start,
      endPTS: end,
      startDTS: start,
      endDTS: end,
      type,
      hasAudio: true,
      hasVideo: false,
      nb: nbSamples,
    };

    this.isAudioContiguous = true;

    console.assert(mdat.length, 'MDAT length must not be zero');
    return audioData;
  }
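  // Note (for reference): each output sample starts with the nominal frame
  // duration (1024 or 1152 samples); once the following sample's PTS is
  // known, the previous sample's duration is rewritten to the actual PTS
  // delta divided by scaleFactor, keeping trun durations consistent with
  // the source timestamps.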

  remuxEmptyAudio(
    track: DemuxedAudioTrack,
    timeOffset: number,
    contiguous: boolean,
    videoData: Fragment
  ): RemuxedTrack | undefined {
    const inputTimeScale: number = track.inputTimeScale;
    const mp4timeScale: number = track.samplerate
      ? track.samplerate
      : inputTimeScale;
    const scaleFactor: number = inputTimeScale / mp4timeScale;
    const nextAudioPts: number | null = this.nextAudioPts;
    // sync with video's timestamp
    const startDTS: number =
      (nextAudioPts !== null
        ? nextAudioPts
        : videoData.startDTS * inputTimeScale) + this._initDTS;
    const endDTS: number = videoData.endDTS * inputTimeScale + this._initDTS;
    // one sample's duration value
    const frameDuration: number = scaleFactor * AAC_SAMPLES_PER_FRAME;
    // sample count for this segment's duration
    const nbSamples: number = Math.ceil((endDTS - startDTS) / frameDuration);
    // silent frame
    const silentFrame: Uint8Array | undefined = AAC.getSilentFrame(
      track.manifestCodec || track.codec,
      track.channelCount
    );

    logger.warn('[mp4-remuxer]: remux empty Audio');
    // Can't remux if we can't generate a silent frame...
    if (!silentFrame) {
      logger.trace(
        '[mp4-remuxer]: Unable to remuxEmptyAudio since we were unable to get a silent frame for given audio codec'
      );
      return;
    }

    const samples: Array<any> = [];
    for (let i = 0; i < nbSamples; i++) {
      const stamp = startDTS + i * frameDuration;
      samples.push({ unit: silentFrame, pts: stamp, dts: stamp });
    }
    track.samples = samples;

    return this.remuxAudio(track, timeOffset, contiguous, false);
  }
}

export function normalizePts(value: number, reference: number | null): number {
  let offset;
  if (reference === null) {
    return value;
  }

  if (reference < value) {
    // - 2^33
    offset = -8589934592;
  } else {
    // + 2^33
    offset = 8589934592;
  }
  /* PTS is 33 bits (from 0 to 2^33 - 1);
     if the diff between value and reference is bigger than half the amplitude (2^32),
     it means PTS looping occurred: fill the gap */
  while (Math.abs(value - reference) > 4294967296) {
    value += offset;
  }

  return value;
}
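// Worked example (illustrative): with reference = 2^33 - 1000 and
// value = 500 (a 33-bit wrap), |value - reference| > 2^32, so 2^33 is
// added once and the function returns 8589935092, restoring a monotonic
// timeline relative to the reference.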

function findKeyframeIndex(samples: Array<AvcSample>): number {
  for (let i = 0; i < samples.length; i++) {
    if (samples[i].key) {
      return i;
    }
  }
  return -1;
}

export function flushTextTrackMetadataCueSamples(
  track: DemuxedMetadataTrack,
  timeOffset: number,
  initPTS: number,
  initDTS: number
): RemuxedMetadata | undefined {
  const length = track.samples.length;
  if (!length) {
    return;
  }
  const inputTimeScale = track.inputTimeScale;
  for (let index = 0; index < length; index++) {
    const sample = track.samples[index];
    // setting id3 pts, dts to relative time
    // using this._initPTS and this._initDTS to calculate relative time
    sample.pts =
      normalizePts(sample.pts - initPTS, timeOffset * inputTimeScale) /
      inputTimeScale;
    sample.dts =
      normalizePts(sample.dts - initDTS, timeOffset * inputTimeScale) /
      inputTimeScale;
  }
  const samples = track.samples;
  track.samples = [];
  return {
    samples,
  };
}

export function flushTextTrackUserdataCueSamples(
  track: DemuxedUserdataTrack,
  timeOffset: number,
  initPTS: number
): RemuxedUserdata | undefined {
  const length = track.samples.length;
  if (!length) {
    return;
  }

  const inputTimeScale = track.inputTimeScale;
  for (let index = 0; index < length; index++) {
    const sample = track.samples[index];
    // setting text pts, dts to relative time
    // using this._initPTS and this._initDTS to calculate relative time
    sample.pts =
      normalizePts(sample.pts - initPTS, timeOffset * inputTimeScale) /
      inputTimeScale;
  }
  track.samples.sort((a, b) => a.pts - b.pts);
  const samples = track.samples;
  track.samples = [];
  return {
    samples,
  };
}

class Mp4Sample {
  public size: number;
  public duration: number;
  public cts: number;
  public flags: Mp4SampleFlags;

  constructor(isKeyframe: boolean, duration, size, cts) {
    this.duration = duration;
    this.size = size;
    this.cts = cts;
    this.flags = new Mp4SampleFlags(isKeyframe);
  }
}

class Mp4SampleFlags {
  public isLeading: 0 = 0;
  public isDependedOn: 0 = 0;
  public hasRedundancy: 0 = 0;
  public degradPrio: 0 = 0;
  public dependsOn: 1 | 2 = 1;
  public isNonSync: 0 | 1 = 1;

  constructor(isKeyframe) {
    this.dependsOn = isKeyframe ? 2 : 1;
    this.isNonSync = isKeyframe ? 0 : 1;
  }
}
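
// Minimal usage sketch (illustrative only, not part of this module): the
// transmuxer owns an instance and feeds it demuxed tracks. The variable
// names below (observer, config, typeSupported, *Track) are placeholders.
//
//   const remuxer = new MP4Remuxer(observer, config, typeSupported);
//   const { audio, video, initSegment } = remuxer.remux(
//     audioTrack, videoTrack, id3Track, textTrack,
//     timeOffset, accurateTimeOffset, false, PlaylistLevelType.MAIN
//   );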