// Reference: https://learn.microsoft.com/ja-jp/windows/win32/medfound/tutorial--encoding-an-mp4-file-

#include <new>
#include <iostream>
#include <memory>
#include <cstdio>
#include <cstdint>
#include <cstdlib>

#include <windows.h>
#include <mfapi.h>
#include <Mfidl.h>
#include <shlwapi.h>
#include <codecapi.h>

// Releases the object behind pT (if any) and resets the caller's pointer to NULL
// so it cannot be released twice through the same variable.
template <class T> void SafeRelease(T *&pT) {
	if (pT == NULL) return;

	pT->Release();
	pT = NULL;
}

// Debug helper: reports the reference count of `unk` as observed through a
// temporary AddRef/Release pair. Returns 0 for a NULL pointer.
ULONG GetRefCount(IUnknown *unk) {
	if (unk == NULL) {
		return 0;
	}

	// AddRef reports the count including our temporary reference.
	const ULONG countWithProbe = unk->AddRef();
	unk->Release();
	return countWithProbe - 1;
}

// Parameters describing the H.264 video output stream.
// CreateH264Profile copies each field into a media-type attribute.
struct H264ProfileInfo {
	UINT32  profile;     // stored as MF_MT_MPEG2_PROFILE
	MFRatio fps;         // stored as MF_MT_FRAME_RATE (Numerator / Denominator)
	MFRatio frame_size;  // stored as MF_MT_FRAME_SIZE; Numerator and Denominator carry the two size components (presumably width and height -- confirm)
	UINT32  bitrate;     // stored as MF_MT_AVG_BITRATE
};

// Parameters describing one AAC audio output configuration.
// CreateAACProfile copies each field into a media-type attribute.
struct AACProfileInfo {
	UINT32  samplesPerSec;  // stored as MF_MT_AUDIO_SAMPLES_PER_SECOND
	UINT32  numChannels;    // stored as MF_MT_AUDIO_NUM_CHANNELS
	UINT32  bitsPerSample;  // stored as MF_MT_AUDIO_BITS_PER_SAMPLE
	UINT32  bytesPerSec;    // stored as MF_MT_AUDIO_AVG_BYTES_PER_SECOND
	UINT32  aacProfile;     // stored as MF_MT_AAC_AUDIO_PROFILE_LEVEL_INDICATION
};

// Table of selectable AAC configurations; CreateAACProfile indexes into it.
// Columns: samplesPerSec, numChannels, bitsPerSample, bytesPerSec, aacProfile.
// NOTE(review): the 96000 Hz entry may not be accepted by the system AAC
// encoder -- confirm against the encoder's supported output formats.
AACProfileInfo aac_profiles[] = {
	{ 96000, 2, 16, 24000, 0x29 },
	{ 48000, 2, 16, 24000, 0x29 },
	{ 44100, 2, 16, 16000, 0x29 },
	{ 44100, 2, 16, 12000, 0x29 },
};

// Opens the media at pszURL through a source resolver and hands back its
// IMFMediaSource interface in *ppSource.
//
// Returns the first failing HRESULT, or the result of the final QueryInterface.
// *ppSource is only written by that QueryInterface call.
HRESULT CreateMediaSource(PCWSTR pszURL, IMFMediaSource **ppSource) {
	IMFSourceResolver *resolver = NULL;
	IUnknown *resolved = NULL;
	MF_OBJECT_TYPE resolvedType = MF_OBJECT_INVALID;

	// Step 1: create the source resolver.
	HRESULT hr = ::MFCreateSourceResolver(&resolver);

	// Step 2: let the resolver build a media source object for the URL.
	if (SUCCEEDED(hr)) {
		hr = resolver->CreateObjectFromURL(pszURL, MF_RESOLUTION_MEDIASOURCE, NULL, &resolvedType, &resolved);
	}

	// Step 3: ask the resolved object for IMFMediaSource.
	if (SUCCEEDED(hr)) {
		hr = resolved->QueryInterface(IID_PPV_ARGS(ppSource));
	}

	SafeRelease(resolver);
	SafeRelease(resolved);
	return hr;
}

// Combines two media sources into one aggregate source.
//
// *ppAggSource is cleared first and receives the aggregate on success.
// Returns the first failing HRESULT, or the result of MFCreateAggregateSource.
HRESULT CreateAggregatedSource(IMFMediaSource *pSource1, IMFMediaSource *pSource2, IMFMediaSource **ppAggSource) {
	*ppAggSource = NULL;

	IMFCollection *sources = NULL;
	HRESULT hr = ::MFCreateCollection(&sources);

	// Add both sources in argument order, then aggregate the collection.
	if (SUCCEEDED(hr)) hr = sources->AddElement(pSource1);
	if (SUCCEEDED(hr)) hr = sources->AddElement(pSource2);
	if (SUCCEEDED(hr)) hr = ::MFCreateAggregateSource(sources, ppAggSource);

	SafeRelease(sources);
	return hr;
}

// Reads the duration, frame rate and frame size advertised by a media source.
//
// pSource   - source whose presentation descriptor is inspected.
// pDuration - optional (may be NULL); cleared to 0 up front, then receives MF_PD_DURATION.
// fps       - optional (may be NULL); receives MF_MT_FRAME_RATE of video media types.
// framesize - optional (may be NULL); receives MF_MT_FRAME_SIZE of video media types
//             (Numerator/Denominator hold the two packed size values).
//
// Every stream descriptor and every media type is visited, so when several
// video media types exist the values of the last one visited win.
//
// Returns immediately with the failure code if the presentation descriptor,
// the duration or the stream count cannot be obtained. Failures inside the
// per-stream / per-media-type loops do not stop the enumeration: hr simply
// holds the result of the most recent call, so a later success can overwrite
// an earlier per-stream failure.
HRESULT GetSourceDescription(IMFMediaSource *pSource, MFTIME *pDuration, MFRatio *fps, MFRatio *framesize) {
	HRESULT hr;

	IMFPresentationDescriptor *pPD = NULL;
	DWORD sdCnt = 0;

	// pDuration is optional; only touch it when the caller supplied it.
	if (pDuration) *pDuration = 0;

	if (FAILED(hr = pSource->CreatePresentationDescriptor(&pPD))) goto END;
	if (pDuration) if (FAILED(hr = pPD->GetUINT64(MF_PD_DURATION, reinterpret_cast<UINT64*>(pDuration)))) goto END;

	if (FAILED(hr = pPD->GetStreamDescriptorCount(&sdCnt))) goto END;
	for (DWORD j = 0; j < sdCnt; ++j) {
		BOOL sel;
		IMFStreamDescriptor *pSD = NULL;
		IMFMediaTypeHandler *pMTH = NULL;
		GUID majorType;
		DWORD mtCnt = 0;

		if (FAILED(hr = pPD->GetStreamDescriptorByIndex(j, &sel, &pSD))) goto LOOPJ_END;
		if (FAILED(hr = pSD->GetMediaTypeHandler(&pMTH))) goto LOOPJ_END;

		// Only audio and video streams are of interest.
		if (FAILED(hr = pMTH->GetMajorType(&majorType))) goto LOOPJ_END;
		if (majorType != MFMediaType_Audio && majorType != MFMediaType_Video) goto LOOPJ_END;

		// Do not iterate with an unknown count if the query fails.
		if (FAILED(hr = pMTH->GetMediaTypeCount(&mtCnt))) goto LOOPJ_END;
		for (DWORD i = 0; i < mtCnt; ++i) {
			IMFMediaType *pMediaType = NULL;
			if (FAILED(hr = pMTH->GetMediaTypeByIndex(i, &pMediaType))) goto LOOPI_END;
			if (majorType == MFMediaType_Video) {
				// Audio media types carry nothing this function reports.
				if (fps) if (FAILED(hr = ::MFGetAttributeRatio(pMediaType, MF_MT_FRAME_RATE, reinterpret_cast<UINT32*>(&fps->Numerator), reinterpret_cast<UINT32*>(&fps->Denominator)))) goto LOOPI_END;
				if (framesize) if (FAILED(hr = ::MFGetAttributeSize(pMediaType, MF_MT_FRAME_SIZE, reinterpret_cast<UINT32*>(&framesize->Numerator), reinterpret_cast<UINT32*>(&framesize->Denominator)))) goto LOOPI_END;
			}
LOOPI_END:
			SafeRelease(pMediaType);
		}

LOOPJ_END:
		SafeRelease(pSD);
		SafeRelease(pMTH);
	}

END:
	SafeRelease(pPD);
	return hr;
}

// Obtains a presentation descriptor from pSource and stores `duration` on it
// under MF_PD_DURATION. Returns the first failing HRESULT, or the result of
// the SetUINT64 call.
// NOTE(review): the descriptor is released before returning; whether the new
// value is visible to later users of pSource depends on whether
// CreatePresentationDescriptor hands out a copy -- confirm.
HRESULT ChangeSourceDuration(IMFMediaSource *pSource, MFTIME duration) {
	IMFPresentationDescriptor *descriptor = NULL;

	HRESULT hr = pSource->CreatePresentationDescriptor(&descriptor);
	if (SUCCEEDED(hr)) {
		hr = descriptor->SetUINT64(MF_PD_DURATION, duration);
	}

	SafeRelease(descriptor);
	return hr;
}

// Builds the attribute store describing the H.264 video output stream.
//
// h264_profile - profile, frame size, frame rate and average bitrate to record.
// ppAttributes - receives an AddRef'ed attribute store on success; left
//                untouched on failure.
//
// Returns the first failing HRESULT, otherwise the result of the last setter.
HRESULT CreateH264Profile(const H264ProfileInfo &h264_profile, IMFAttributes **ppAttributes) {
	IMFAttributes *attrs = NULL;
	const MFRatio &size = h264_profile.frame_size;
	const MFRatio &rate = h264_profile.fps;

	HRESULT hr = ::MFCreateAttributes(&attrs, 5);
	if (SUCCEEDED(hr)) hr = attrs->SetGUID(MF_MT_SUBTYPE, MFVideoFormat_H264);
	if (SUCCEEDED(hr)) hr = attrs->SetUINT32(MF_MT_MPEG2_PROFILE, h264_profile.profile);
	if (SUCCEEDED(hr)) hr = ::MFSetAttributeSize(attrs, MF_MT_FRAME_SIZE, size.Numerator, size.Denominator);
	if (SUCCEEDED(hr)) hr = ::MFSetAttributeRatio(attrs, MF_MT_FRAME_RATE, rate.Numerator, rate.Denominator);
	if (SUCCEEDED(hr)) hr = attrs->SetUINT32(MF_MT_AVG_BITRATE, h264_profile.bitrate);

	if (SUCCEEDED(hr)) {
		// Give the caller its own reference; the local one is dropped below.
		*ppAttributes = attrs;
		(*ppAttributes)->AddRef();
	}

	SafeRelease(attrs);
	return hr;
}

// Builds the attribute store describing the AAC audio output stream from the
// aac_profiles entry selected by `index`.
//
// index        - position in aac_profiles; out-of-range yields E_INVALIDARG.
// ppAttributes - receives an AddRef'ed attribute store on success; left
//                untouched on failure.
//
// Returns the first failing HRESULT, otherwise the result of the last setter.
HRESULT CreateAACProfile(DWORD index, IMFAttributes **ppAttributes) {
	if (!(index < ARRAYSIZE(aac_profiles))) {
		return E_INVALIDARG;
	}

	const AACProfileInfo &entry = aac_profiles[index];
	IMFAttributes *attrs = NULL;

	HRESULT hr = MFCreateAttributes(&attrs, 7);
	if (SUCCEEDED(hr)) hr = attrs->SetGUID(MF_MT_SUBTYPE, MFAudioFormat_AAC);
	if (SUCCEEDED(hr)) hr = attrs->SetUINT32(MF_MT_AUDIO_BITS_PER_SAMPLE, entry.bitsPerSample);
	if (SUCCEEDED(hr)) hr = attrs->SetUINT32(MF_MT_AUDIO_SAMPLES_PER_SECOND, entry.samplesPerSec);
	if (SUCCEEDED(hr)) hr = attrs->SetUINT32(MF_MT_AUDIO_NUM_CHANNELS, entry.numChannels);
	if (SUCCEEDED(hr)) hr = attrs->SetUINT32(MF_MT_AUDIO_AVG_BYTES_PER_SECOND, entry.bytesPerSec);
	if (SUCCEEDED(hr)) hr = attrs->SetUINT32(MF_MT_AUDIO_BLOCK_ALIGNMENT, 1);
	if (SUCCEEDED(hr)) hr = attrs->SetUINT32(MF_MT_AAC_AUDIO_PROFILE_LEVEL_INDICATION, entry.aacProfile);

	if (SUCCEEDED(hr)) {
		// Give the caller its own reference; the local one is dropped below.
		*ppAttributes = attrs;
		(*ppAttributes)->AddRef();
	}

	SafeRelease(attrs);
	return hr;
}

// Assembles a transcode profile with H.264 video, AAC audio and an MPEG-4
// container.
//
// audio_profile - index passed to CreateAACProfile.
// video_profile - settings passed to CreateH264Profile.
// ppProfile     - receives an AddRef'ed profile on success; left untouched on
//                 failure.
//
// Returns the first failing HRESULT, otherwise the result of the last call.
HRESULT CreateTranscodeProfile(int audio_profile, H264ProfileInfo &video_profile, IMFTranscodeProfile **ppProfile) {
	IMFTranscodeProfile *profile = NULL;
	IMFAttributes *videoAttrs = NULL;
	IMFAttributes *audioAttrs = NULL;
	IMFAttributes *containerAttrs = NULL;

	HRESULT hr = ::MFCreateTranscodeProfile(&profile);

	// Video stream settings.
	if (SUCCEEDED(hr)) hr = CreateH264Profile(video_profile, &videoAttrs);
	if (SUCCEEDED(hr)) hr = profile->SetVideoAttributes(videoAttrs);

	// Audio stream settings.
	if (SUCCEEDED(hr)) hr = CreateAACProfile(audio_profile, &audioAttrs);
	if (SUCCEEDED(hr)) hr = profile->SetAudioAttributes(audioAttrs);

	// Container settings.
	if (SUCCEEDED(hr)) hr = ::MFCreateAttributes(&containerAttrs, 1);
	if (SUCCEEDED(hr)) hr = containerAttrs->SetGUID(MF_TRANSCODE_CONTAINERTYPE, MFTranscodeContainerType_MPEG4);
	if (SUCCEEDED(hr)) hr = profile->SetContainerAttributes(containerAttrs);

	if (SUCCEEDED(hr)) {
		// Give the caller its own reference; the local one is dropped below.
		*ppProfile = profile;
		(*ppProfile)->AddRef();
	}

	SafeRelease(profile);
	SafeRelease(audioAttrs);
	SafeRelease(videoAttrs);
	SafeRelease(containerAttrs);
	return hr;
}

// Wraps a Media Foundation media session used for one encoding job.
//
// The object registers itself as the session's event callback. When the
// session reports MESessionClosed, or when any event or call fails, it signals
// an internal auto-reset event so that Wait() returns the final status.
// Lifetime is reference counted; obtain instances through Create().
class CSession : public IMFAsyncCallback {
public:
	// Allocates and initializes a session object.
	// On success *ppSession holds one reference owned by the caller.
	// Returns E_OUTOFMEMORY if allocation fails, or the failure from Initialize().
	static HRESULT Create(CSession **ppSession) {
		*ppSession = NULL;

		CSession *pSession = new (std::nothrow) CSession();
		if (!pSession) return E_OUTOFMEMORY;

		HRESULT hr = pSession->Initialize();
		if (SUCCEEDED(hr)) {
			pSession->AddRef();
			*ppSession = pSession;
		}

		// Drop the construction reference (the object starts at a count of 1).
		SafeRelease(pSession);
		return hr;
	}

	// IUnknown methods
	STDMETHODIMP QueryInterface(REFIID riid, void** ppv) {
		static const QITAB qit[] = {
			QITABENT(CSession, IMFAsyncCallback),
			{ 0 }
		};
		return QISearch(this, qit, riid, ppv);
	}
	STDMETHODIMP_(ULONG) AddRef() {
		return ::InterlockedIncrement(&m_cRef);
	}
	STDMETHODIMP_(ULONG) Release() {
		long cRef = ::InterlockedDecrement(&m_cRef);
		if (cRef == 0) delete this;
		return cRef;
	}

	// IMFAsyncCallback methods
	STDMETHODIMP GetParameters(DWORD* pdwFlags, DWORD* pdwQueue) {
		// Implementation of this method is optional.
		return E_NOTIMPL;
	}

	// Callback passed to BeginGetEvent: handles one session event.
	// - MESessionEnded closes the session.
	// - MESessionClosed signals the wait event.
	// - Any other event just requests the next one.
	// On failure the error is recorded in m_hrStatus, the session is closed and
	// the wait event is signalled so that Wait() reports the error.
	STDMETHODIMP Invoke(IMFAsyncResult *pResult) {
		HRESULT hr;
		HRESULT hrStatus = S_OK;

		IMFMediaEvent* pEvent = NULL;
		MediaEventType meType = MEUnknown;

		if (FAILED(hr = m_pSession->EndGetEvent(pResult, &pEvent))) goto END;
		if (FAILED(hr = pEvent->GetType(&meType))) goto END;
		if (FAILED(hr = pEvent->GetStatus(&hrStatus))) goto END;
		if (FAILED(hrStatus)) {
			hr = hrStatus;
			goto END;
		}

		switch (meType) {
			case MESessionEnded:
				if (FAILED(hr = m_pSession->Close())) goto END;
				break;

			case MESessionClosed:
				::SetEvent(m_hWaitEvent);
				break;
		}

		if (meType != MESessionClosed) {
			hr = m_pSession->BeginGetEvent(this, NULL);
		}

END:
		if (FAILED(hr)) {
			m_hrStatus = hr;
			m_pSession->Close();
			// No further event is requested on this path, so MESessionClosed
			// will never be delivered here. Wake the waiter now; otherwise
			// Wait() would return E_PENDING forever.
			::SetEvent(m_hWaitEvent);
		}

		SafeRelease(pEvent);
		return hr;
	}

	// Other methods

	// Queues the topology on the session and starts it from the default position.
	HRESULT StartEncodingSession(IMFTopology *pTopology) {
		HRESULT hr;

		if (SUCCEEDED(hr = m_pSession->SetTopology(0, pTopology))) {
			// An empty (VT_EMPTY) start position. The variant must be
			// initialized, not cleared: clearing uninitialized storage would
			// interpret garbage as an owned value.
			PROPVARIANT varStart;
			PropVariantInit(&varStart);
			hr = m_pSession->Start(&GUID_NULL, &varStart);
		}
		return hr;
	}

	// Reads the current time of the session's presentation clock.
	HRESULT GetEncodingPosition(MFTIME *pTime) {
		return m_pClock->GetTime(pTime);
	}

	// Waits up to dwMsec milliseconds for the session to finish.
	// Returns E_PENDING on timeout, the recorded session status once the wait
	// event is signalled, or a failure code if the wait itself fails.
	HRESULT Wait(DWORD dwMsec) {
		switch (::WaitForSingleObject(m_hWaitEvent, dwMsec)) {
			case WAIT_OBJECT_0:
				return m_hrStatus;

			case WAIT_TIMEOUT:
				return E_PENDING;

			default: {
				// A failed wait must not look like "still running", or the
				// caller's polling loop would never end.
				HRESULT hr = HRESULT_FROM_WIN32(::GetLastError());
				return FAILED(hr) ? hr : E_FAIL;
			}
		}
	}

private:
	// The object starts with one reference, which Create() releases.
	CSession() : m_pSession(NULL), m_pClock(NULL), m_hrStatus(S_OK), m_hWaitEvent(NULL), m_cRef(1) {}
	virtual ~CSession() {
		if (m_pSession) m_pSession->Shutdown();

		SafeRelease(m_pClock);
		SafeRelease(m_pSession);
		if (m_hWaitEvent) ::CloseHandle(m_hWaitEvent);
	}

	// Creates the wait event, the media session and its presentation clock,
	// then requests the first session event.
	HRESULT Initialize() {
		HRESULT hr;

		IMFClock *pClock = NULL;

		// Create the event before registering the callback so Invoke never
		// sees a NULL handle, and so a CreateEvent failure does not leave a
		// pending callback referencing this object.
		if (!(m_hWaitEvent = ::CreateEvent(NULL, FALSE, FALSE, NULL))) {
			hr = HRESULT_FROM_WIN32(::GetLastError());
			goto END;
		}
		if (FAILED(hr = MFCreateMediaSession(NULL, &m_pSession))) goto END;
		if (FAILED(hr = m_pSession->GetClock(&pClock))) goto END;
		if (FAILED(hr = pClock->QueryInterface(IID_PPV_ARGS(&m_pClock)))) goto END;
		hr = m_pSession->BeginGetEvent(this, NULL);
END:
		SafeRelease(pClock);
		return hr;
	}

private:
	IMFMediaSession *m_pSession;     // the wrapped media session
	IMFPresentationClock *m_pClock;  // session clock, read by GetEncodingPosition
	HRESULT m_hrStatus;              // final status reported by Wait()
	HANDLE  m_hWaitEvent;            // auto-reset event signalled on close or failure
	long    m_cRef;                  // COM reference count
};

// Polls the session until it finishes, printing progress as a percentage of
// `duration` each time it has advanced by at least UPDATE_INCR points.
//
// pSession - running encoding session.
// duration - total duration used as the 100% mark; when it is not positive,
//            progress output is skipped (avoids dividing by zero).
//
// Returns the first value from CSession::Wait that is not E_PENDING.
HRESULT RunEncodingSession(CSession *pSession, MFTIME duration) {
	const DWORD WAIT_PERIOD = 500;
	const int UPDATE_INCR = 5;

	HRESULT hr = S_OK;
	LONGLONG prev = 0;
	while (1) {
		hr = pSession->Wait(WAIT_PERIOD);
		if (hr != E_PENDING) break;

		// Progress is informational only: if the position cannot be read,
		// skip this update but keep waiting.
		MFTIME pos = 0;
		if (duration <= 0 || FAILED(pSession->GetEncodingPosition(&pos))) continue;

		LONGLONG percent = (100 * pos) / duration;
		if (percent >= prev + UPDATE_INCR) {
			std::printf("%lld %%\n", percent);
			prev = percent;
		}
	}
	return hr;
}
// Transcodes pszInput into an MP4 file at pszOutput (H.264 video, AAC audio).
//
// audio_profile - index into aac_profiles.
// video_fps     - output frame rate; 0 keeps the rate reported by the source.
//                 Negative, NaN or too-large values are rejected.
// video_bitrate - output video bitrate; 0 selects 100000. Negative values are
//                 rejected.
//
// Returns E_INVALIDARG for rejected arguments, otherwise the first failing
// HRESULT or the result of RunEncodingSession.
HRESULT EncodeFile(int audio_profile, double video_fps, int video_bitrate, PCWSTR pszInput, PCWSTR pszOutput) {
	const double FPS_SCALE = 10000.0;            // frame rate is stored as (fps * 10000) / 10000
	const double MAX_SCALED_FPS = 2147483647.0;  // keep the scaled value convertible to an integer

	// Validate before acquiring anything. Converting NaN or an out-of-range
	// double to an integer is undefined, and negative values would wrap to
	// huge unsigned attribute values.
	if (!(video_fps >= 0.0) || video_fps * FPS_SCALE > MAX_SCALED_FPS) return E_INVALIDARG;
	if (video_bitrate < 0) return E_INVALIDARG;

	IMFTranscodeProfile *pProfile = NULL;
	IMFMediaSource *pSource = NULL;
	IMFTopology *pTopology = NULL;
	CSession *pSession = NULL;

	MFTIME duration = 0;
	MFRatio fps = { 0 }, framesize = { 0 };
	H264ProfileInfo h264_profile;
	HRESULT hr;

	if (FAILED(hr = CreateMediaSource(pszInput, &pSource))) goto END;
	if (FAILED(hr = GetSourceDescription(pSource, &duration, &fps, &framesize))) goto END;

	h264_profile.profile = eAVEncH264VProfile_Main;
	if (video_fps == 0) {
		h264_profile.fps = fps;
	} else {
		h264_profile.fps.Numerator = static_cast<DWORD>(video_fps * FPS_SCALE);
		h264_profile.fps.Denominator = 10000;
	}
	h264_profile.frame_size = framesize;
	h264_profile.bitrate = video_bitrate == 0 ? 100000 : video_bitrate;

	if (FAILED(hr = CreateTranscodeProfile(audio_profile, h264_profile, &pProfile))) goto END;
	if (FAILED(hr = ::MFCreateTranscodeTopology(pSource, pszOutput, pProfile, &pTopology))) goto END;
	if (FAILED(hr = CSession::Create(&pSession))) goto END;
	if (FAILED(hr = pSession->StartEncodingSession(pTopology))) goto END;

	hr = RunEncodingSession(pSession, duration);

END:
	if (pSource) pSource->Shutdown();

	SafeRelease(pSession);
	SafeRelease(pProfile);
	SafeRelease(pSource);
	SafeRelease(pTopology);
	return hr;
}

int wmain(int argc, wchar_t * argv[]) {
	uint64_t flg;

	double video_fps = 0;
	int video_bitrate = 0;

	int audio_profile = 0;

	::HeapSetInformation(NULL, HeapEnableTerminationOnCorruption, NULL, 0);

	if (argc < 3 || argc > 6) {
		std::printf("Usage:\n");
		std::printf("input output [ audio_profile video_fps video_bitrate ]\n");
		return 1;
	}

	if (argc > 3) {
		audio_profile = std::wcstol(argv[3], 0, 10);
	}
	if (argc > 5) {
		video_fps = std::wcstod(argv[4], 0);
		video_bitrate = std::wcstol(argv[5], 0, 10);
	}

	if (FAILED(::CoInitializeEx(NULL, COINIT_APARTMENTTHREADED))) goto END;
	if (FAILED(::MFStartup(MF_VERSION))) goto END;
	if (FAILED(EncodeFile(audio_profile, video_fps, video_bitrate, argv[1], argv[2]))) goto END;

	std::printf("Done.\n");

	return 0;
END:
	::MFShutdown();
	::CoUninitialize();
	return 0;
}
