/*
 * Decompiled with CFR 0.152.
 */
package jdk.graal.compiler.lir.amd64;

import java.util.Arrays;
import java.util.EnumSet;
import jdk.graal.compiler.asm.Label;
import jdk.graal.compiler.asm.amd64.AMD64Address;
import jdk.graal.compiler.asm.amd64.AMD64Assembler;
import jdk.graal.compiler.asm.amd64.AMD64MacroAssembler;
import jdk.graal.compiler.asm.amd64.AVXKind;
import jdk.graal.compiler.core.common.LIRKind;
import jdk.graal.compiler.core.common.Stride;
import jdk.graal.compiler.core.common.StrideUtil;
import jdk.graal.compiler.debug.GraalError;
import jdk.graal.compiler.lir.LIRInstruction;
import jdk.graal.compiler.lir.LIRInstructionClass;
import jdk.graal.compiler.lir.Opcode;
import jdk.graal.compiler.lir.amd64.AMD64ComplexVectorOp;
import jdk.graal.compiler.lir.amd64.AMD64ControlFlow;
import jdk.graal.compiler.lir.asm.CompilationResultBuilder;
import jdk.graal.compiler.lir.gen.LIRGeneratorTool;
import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.code.RegisterValue;
import jdk.vm.ci.code.ValueUtil;
import jdk.vm.ci.meta.AllocatableValue;
import jdk.vm.ci.meta.JavaKind;
import jdk.vm.ci.meta.PlatformKind;
import jdk.vm.ci.meta.Value;

/**
 * LIR instruction that copies {@code length} elements from a source array region to a destination
 * array region while optionally changing the element width (stride) on the fly.
 *
 * <p>
 * Depending on the source and destination strides the emitted code performs one of three things
 * (see {@link #emitOp}):
 * <ul>
 * <li>equal strides: a plain copy ({@link #emitCopy}),</li>
 * <li>source narrower than destination: an inflation using sign/zero extension as selected by
 * {@code extendMode} ({@link #emitInflate}),</li>
 * <li>source wider than destination: a compression using the x86 pack instructions
 * ({@link #emitCompress}).</li>
 * </ul>
 *
 * <p>
 * The strides are either compile-time constants ({@code strideSrcConst}/{@code strideDstConst}) or
 * supplied at run time in {@code dynamicStrides}; in the latter case all nine stride combinations
 * are emitted behind a jump table (see {@link #emitCode}).
 *
 * <p>
 * All operands live in fixed registers (see the {@code REG_*} constants). Every input is also
 * declared as a temp because the emitted code overwrites the array, offset and length registers.
 */
@Opcode(value="ARRAY_COPY_WITH_CONVERSIONS")
public final class AMD64ArrayCopyWithConversionsOp
extends AMD64ComplexVectorOp {
    public static final LIRInstructionClass<AMD64ArrayCopyWithConversionsOp> TYPE = LIRInstructionClass.create(AMD64ArrayCopyWithConversionsOp.class);

    // Fixed registers the operands are moved into by movParamsAndCreate.
    private static final Register REG_ARRAY_SRC = AMD64.rsi;
    private static final Register REG_OFFSET_SRC = AMD64.rax;
    private static final Register REG_ARRAY_DST = AMD64.rdi;
    private static final Register REG_OFFSET_DST = AMD64.rcx;
    private static final Register REG_LENGTH = AMD64.rdx;
    private static final Register REG_STRIDE = AMD64.r8;

    /** Constant source stride, or {@code null} when the strides are dynamic. */
    private final Stride strideSrcConst;
    /** Constant destination stride, or {@code null} when the strides are dynamic. */
    private final Stride strideDstConst;
    /** Extension mode handed to {@code pmovSZx} when inflating. */
    private final AMD64MacroAssembler.ExtendMode extendMode;

    @LIRInstruction.Use(value={LIRInstruction.OperandFlag.REG})
    private Value arraySrc;
    @LIRInstruction.Use(value={LIRInstruction.OperandFlag.REG})
    private Value offsetSrc;
    @LIRInstruction.Use(value={LIRInstruction.OperandFlag.REG})
    private Value arrayDst;
    @LIRInstruction.Use(value={LIRInstruction.OperandFlag.REG})
    private Value offsetDst;
    @LIRInstruction.Use(value={LIRInstruction.OperandFlag.REG})
    private Value length;
    /** Run-time stride selector; {@code Value.ILLEGAL} when the strides are constant. */
    @LIRInstruction.Use(value={LIRInstruction.OperandFlag.REG, LIRInstruction.OperandFlag.ILLEGAL})
    private Value dynamicStrides;

    // Each *Tmp field aliases the corresponding input value (see the constructor), marking the
    // same register as a temp of this instruction.
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG})
    private Value arraySrcTmp;
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG})
    private Value offsetSrcTmp;
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG})
    private Value arrayDstTmp;
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG})
    private Value offsetDstTmp;
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG})
    private Value lengthTmp;
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG, LIRInstruction.OperandFlag.ILLEGAL})
    private Value dynamicStridesTmp;
    /** Vector scratch registers; the count depends on the operation (see the constructor). */
    @LIRInstruction.Temp(value={LIRInstruction.OperandFlag.REG})
    private Value[] vectorTemp;

    /**
     * Creates the instruction. The operand values are expected to already be in registers; use one
     * of the {@code movParamsAndCreate} factories to get that done.
     *
     * @param strideSrc constant source stride; only read when the strides are constant
     * @param strideDst constant destination stride; only read when the strides are constant
     * @param dynamicStrides run-time stride selector, or {@code Value.ILLEGAL} for constant strides
     */
    private AMD64ArrayCopyWithConversionsOp(LIRGeneratorTool tool, Stride strideSrc, Stride strideDst, EnumSet<AMD64.CPUFeature> runtimeCheckedCPUFeatures, Value arraySrc, Value offsetSrc, Value arrayDst, Value offsetDst, Value length, Value dynamicStrides, AMD64MacroAssembler.ExtendMode extendMode) {
        super(TYPE, tool, runtimeCheckedCPUFeatures, AVXKind.AVXSize.YMM);
        this.extendMode = extendMode;
        GraalError.guarantee(supports(tool.target(), runtimeCheckedCPUFeatures, AMD64.CPUFeature.SSE2, new AMD64.CPUFeature[0]), "needs at least SSE2 support");

        this.arraySrc = arraySrc;
        this.arraySrcTmp = arraySrc;
        this.offsetSrc = offsetSrc;
        this.offsetSrcTmp = offsetSrc;
        this.arrayDst = arrayDst;
        this.arrayDstTmp = arrayDst;
        this.offsetDst = offsetDst;
        this.offsetDstTmp = offsetDst;
        this.length = length;
        this.lengthTmp = length;
        this.dynamicStrides = dynamicStrides;
        this.dynamicStridesTmp = dynamicStrides;

        if (StrideUtil.useConstantStrides(dynamicStrides)) {
            this.strideSrcConst = strideSrc;
            this.strideDstConst = strideDst;
            this.vectorTemp = new Value[getNumberOfRequiredVectorRegisters(getOp(this.strideDstConst, this.strideSrcConst))];
        } else {
            this.strideSrcConst = null;
            this.strideDstConst = null;
            // All variants are emitted, so reserve the maximum any single operation needs.
            this.vectorTemp = new Value[5];
        }
        for (int i = 0; i < this.vectorTemp.length; ++i) {
            this.vectorTemp[i] = tool.newVariable(LIRKind.value((PlatformKind)this.getVectorKind(JavaKind.Byte)));
        }
    }

    /**
     * Moves the operands into the fixed registers and creates the instruction for constant
     * strides.
     */
    public static AMD64ArrayCopyWithConversionsOp movParamsAndCreate(LIRGeneratorTool tool, Stride strideSrc, Stride strideDst, EnumSet<AMD64.CPUFeature> runtimeCheckedCPUFeatures, Value arraySrc, Value offsetSrc, Value arrayDst, Value offsetDst, Value length, AMD64MacroAssembler.ExtendMode extendMode) {
        return movParamsAndCreate(tool, strideSrc, strideDst, runtimeCheckedCPUFeatures, arraySrc, offsetSrc, arrayDst, offsetDst, length, Value.ILLEGAL, extendMode);
    }

    /**
     * Moves the operands into the fixed registers and creates the instruction for strides that are
     * only known at run time and passed in {@code stride}.
     */
    public static AMD64ArrayCopyWithConversionsOp movParamsAndCreate(LIRGeneratorTool tool, EnumSet<AMD64.CPUFeature> runtimeCheckedCPUFeatures, Value arraySrc, Value offsetSrc, Value arrayDst, Value offsetDst, Value length, Value stride, AMD64MacroAssembler.ExtendMode extendMode) {
        return movParamsAndCreate(tool, null, null, runtimeCheckedCPUFeatures, arraySrc, offsetSrc, arrayDst, offsetDst, length, stride, extendMode);
    }

    /**
     * Shared factory: emits the moves of all operands into their fixed registers (array references
     * go through {@code emitConvertNullToZero}) and then instantiates the op on those registers.
     * {@code dynamicStrides} is only moved when it is not {@code Value.ILLEGAL}.
     */
    private static AMD64ArrayCopyWithConversionsOp movParamsAndCreate(LIRGeneratorTool tool, Stride strideSrc, Stride strideDst, EnumSet<AMD64.CPUFeature> runtimeCheckedCPUFeatures, Value arraySrc, Value offsetSrc, Value arrayDst, Value offsetDst, Value length, Value dynamicStrides, AMD64MacroAssembler.ExtendMode extendMode) {
        RegisterValue regArraySrc = REG_ARRAY_SRC.asValue(arraySrc.getValueKind());
        RegisterValue regOffsetSrc = REG_OFFSET_SRC.asValue(offsetSrc.getValueKind());
        RegisterValue regArrayDst = REG_ARRAY_DST.asValue(arrayDst.getValueKind());
        RegisterValue regOffsetDst = REG_OFFSET_DST.asValue(offsetDst.getValueKind());
        RegisterValue regLength = REG_LENGTH.asValue(length.getValueKind());

        tool.emitConvertNullToZero(regArraySrc, arraySrc);
        tool.emitMove(regOffsetSrc, offsetSrc);
        tool.emitConvertNullToZero(regArrayDst, arrayDst);
        tool.emitMove(regOffsetDst, offsetDst);
        tool.emitMove(regLength, length);

        AllocatableValue regDynamicStrides;
        if (ValueUtil.isIllegal(dynamicStrides)) {
            regDynamicStrides = Value.ILLEGAL;
        } else {
            regDynamicStrides = REG_STRIDE.asValue(dynamicStrides.getValueKind());
            tool.emitMove(regDynamicStrides, dynamicStrides);
        }
        return new AMD64ArrayCopyWithConversionsOp(tool, strideSrc, strideDst, runtimeCheckedCPUFeatures, regArraySrc, regOffsetSrc, regArrayDst, regOffsetDst, regLength, regDynamicStrides, extendMode);
    }

    /**
     * Maps a (destination, source) stride pair to the operation to perform.
     *
     * @throws UnsupportedOperationException if the pair is not one of the S1/S2/S4 combinations
     *             handled here
     */
    private static Op getOp(Stride strideDst, Stride strideSrc) {
        if (strideDst.value == strideSrc.value) {
            return Op.copy;
        }
        if (strideDst.value < strideSrc.value) {
            // Destination is narrower: compress.
            switch (strideSrc) {
                case S2: {
                    assert (strideDst == Stride.S1) : strideDst;
                    return Op.compressCharToByte;
                }
                case S4: {
                    switch (strideDst) {
                        case S1: {
                            return Op.compressIntToByte;
                        }
                        case S2: {
                            return Op.compressIntToChar;
                        }
                    }
                    throw new UnsupportedOperationException();
                }
            }
            throw new UnsupportedOperationException();
        }
        // Destination is wider: inflate.
        switch (strideSrc) {
            case S1: {
                switch (strideDst) {
                    case S2: {
                        return Op.inflateByteToChar;
                    }
                    case S4: {
                        return Op.inflateByteToInt;
                    }
                }
                throw new UnsupportedOperationException();
            }
            case S2: {
                assert (strideDst == Stride.S4) : strideDst;
                return Op.inflateCharToInt;
            }
        }
        throw new UnsupportedOperationException();
    }

    /**
     * Returns how many vector scratch registers the given operation uses: two for the single-step
     * compressions, five for int-to-byte compression (four data vectors plus the permutation mask
     * at index 4), and one for everything else.
     */
    private static int getNumberOfRequiredVectorRegisters(Op op) {
        switch (op) {
            case compressCharToByte:
            case compressIntToChar:
                return 2;
            case compressIntToByte:
                return 5;
            default:
                return 1;
        }
    }

    /**
     * Emits the machine code. First folds the byte offsets into the array pointers; afterwards
     * the source offset register is handed to the emitters as a scratch register. With constant
     * strides a single variant is emitted; with dynamic strides all nine variants are emitted and
     * selected through a jump table.
     */
    @Override
    public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
        Register src = ValueUtil.asRegister(this.arraySrc);
        Register sro = ValueUtil.asRegister(this.offsetSrc);
        Register dst = ValueUtil.asRegister(this.arrayDst);
        Register dso = ValueUtil.asRegister(this.offsetDst);
        Register len = ValueUtil.asRegister(this.length);

        // src += sro; dst += dso (offsets are applied with scale 1).
        asm.leaq(src, new AMD64Address(src, sro, Stride.S1));
        asm.leaq(dst, new AMD64Address(dst, dso, Stride.S1));

        if (ValueUtil.isIllegal(this.dynamicStrides)) {
            this.emitOp(crb, asm, this.strideSrcConst, this.strideDstConst, src, sro, dst, len);
            return;
        }

        // One label per (source stride, destination stride) combination: 3 x 3 = 9.
        Label[] variants = new Label[9];
        Label end = new Label();
        for (int i = 0; i < variants.length; ++i) {
            variants[i] = new Label();
        }
        // NOTE(review): dso is no longer needed after the leaq above and is presumably used by
        // emitJumpTable as a scratch register - confirm against RangeTableSwitchOp.
        AMD64ControlFlow.RangeTableSwitchOp.emitJumpTable(crb, asm, dso, ValueUtil.asRegister(this.dynamicStrides), 0, 8, Arrays.stream(variants));

        // The same-stride variants share the S1->S1 copy: the S4 entry falls through the S2 entry
        // into the S1 entry, doubling len at each step so that it ends up as a byte count.
        asm.align(this.preferredBranchTargetAlignment(crb));
        asm.bind(variants[StrideUtil.getDirectStubCallIndex(Stride.S4, Stride.S4)]);
        asm.maybeEmitIndirectTargetMarker();
        asm.shll(len, 1);
        asm.align(this.preferredBranchTargetAlignment(crb));
        asm.bind(variants[StrideUtil.getDirectStubCallIndex(Stride.S2, Stride.S2)]);
        asm.maybeEmitIndirectTargetMarker();
        asm.shll(len, 1);
        asm.align(this.preferredBranchTargetAlignment(crb));
        asm.bind(variants[StrideUtil.getDirectStubCallIndex(Stride.S1, Stride.S1)]);
        asm.maybeEmitIndirectTargetMarker();
        this.emitOp(crb, asm, Stride.S1, Stride.S1, src, sro, dst, len);
        asm.jmp(end);

        // All remaining (converting) combinations get their own code.
        for (Stride strideSrc : new Stride[]{Stride.S1, Stride.S2, Stride.S4}) {
            for (Stride strideDst : new Stride[]{Stride.S1, Stride.S2, Stride.S4}) {
                if (strideSrc == strideDst) {
                    continue;
                }
                asm.align(this.preferredBranchTargetAlignment(crb));
                asm.bind(variants[StrideUtil.getDirectStubCallIndex(strideSrc, strideDst)]);
                asm.maybeEmitIndirectTargetMarker();
                this.emitOp(crb, asm, strideSrc, strideDst, src, sro, dst, len);
                asm.jmp(end);
            }
        }
        asm.bind(end);
    }

    /**
     * Dispatches to inflate, copy or compress depending on how the two strides compare.
     * {@code sro} is passed on as the scratch register of the selected emitter.
     */
    private void emitOp(CompilationResultBuilder crb, AMD64MacroAssembler asm, Stride strideSrc, Stride strideDst, Register src, Register sro, Register dst, Register len) {
        if (strideSrc.value < strideDst.value) {
            this.emitInflate(crb, asm, strideSrc, strideDst, src, dst, len, sro);
        } else if (strideSrc.value == strideDst.value) {
            this.emitCopy(crb, asm, strideSrc, strideDst, src, dst, len, sro);
        } else {
            this.emitCompress(crb, asm, strideSrc, strideDst, src, dst, len, sro);
        }
    }

    /**
     * Emits a narrowing copy (source stride larger than destination stride).
     *
     * <p>
     * With SSE4.2 available the bulk is processed in full vectors, followed by one more full
     * vector aligned to the end of the region (which may overlap already written elements). Inputs
     * shorter than a full vector try a 16-byte step (AVX2 only), then an 8-byte step, each
     * performed once at the start and once aligned to the end. Whatever is still too short, and
     * everything when SSE4.2 is unavailable, goes through the scalar loop.
     *
     * @param len element count on entry; clobbered
     * @param tmp scratch register; clobbered
     */
    private void emitCompress(CompilationResultBuilder crb, AMD64MacroAssembler asm, Stride strideSrc, Stride strideDst, Register src, Register dst, Register len, Register tmp) {
        Op op = getOp(strideDst, strideSrc);
        Label labelScalarLoop = new Label();
        Label labelDone = new Label();
        if (asm.supports(AMD64.CPUFeature.SSE4_2)) {
            Label labelPackVectorLoop = new Label();
            Label labelPack16Bytes = new Label();
            Label labelPack8Bytes = new Label();
            Label labelCopyTail = new Label();
            // Number of destination elements produced per full vector.
            int vectorLength = this.vectorSize.getBytes() / strideDst.value;

            asm.movl(tmp, len);
            // tmp = number of tail elements, len = number of elements covered by whole vectors.
            asm.andl(tmp, vectorLength - 1);
            asm.andlAndJcc(len, -vectorLength, AMD64Assembler.ConditionFlag.Zero, this.supportsAVX2AndYMM() ? labelPack16Bytes : labelPack8Bytes, false);

            if (this.supportsAVX2AndYMM() && op == Op.compressIntToByte) {
                // Permutation mask consumed by VPERMD in packVector.
                this.loadMask(crb, asm, ValueUtil.asRegister(this.vectorTemp[4]), getAVX2IntToBytePackingUnscrambleMap());
            }

            // Point src/dst past the vectorized part and count len up from -len to zero.
            asm.leaq(src, new AMD64Address(src, len, strideSrc));
            asm.leaq(dst, new AMD64Address(dst, len, strideDst));
            asm.negq(len);

            // Vectorized main loop.
            asm.align(this.preferredLoopAlignment(crb));
            asm.bind(labelPackVectorLoop);
            this.packVector(asm, this.vectorSize, op, strideSrc, strideDst, src, dst, len, 0, false);
            asm.addqAndJcc(len, vectorLength, AMD64Assembler.ConditionFlag.NotZero, labelPackVectorLoop, true);

            // Tail: one more full vector ending exactly at the last element.
            this.packVector(asm, this.vectorSize, op, strideSrc, strideDst, src, dst, tmp, -this.vectorSize.getBytes(), false);
            asm.jmp(labelDone);

            if (this.supportsAVX2AndYMM()) {
                // Fewer elements than one YMM vector: try two (possibly overlapping) XMM steps.
                asm.bind(labelPack16Bytes);
                int vectorSizeXMM = AVXKind.AVXSize.XMM.getBytes() / strideDst.value;
                asm.cmplAndJcc(tmp, vectorSizeXMM, AMD64Assembler.ConditionFlag.Less, labelPack8Bytes, true);
                this.packVector(asm, AVXKind.AVXSize.XMM, op, strideSrc, strideDst, src, dst, len, 0, true);
                this.packVector(asm, AVXKind.AVXSize.XMM, op, strideSrc, strideDst, src, dst, tmp, -AVXKind.AVXSize.XMM.getBytes(), false);
                asm.jmpb(labelDone);
            }

            // Fewer elements than a 16-byte result: try two (possibly overlapping) 8-byte steps.
            asm.bind(labelPack8Bytes);
            int vectorSizeQ = 8 / strideDst.value;
            asm.cmplAndJcc(tmp, vectorSizeQ, AMD64Assembler.ConditionFlag.Less, labelCopyTail, true);
            this.pack8Bytes(asm, op, strideSrc, strideDst, src, dst, len, 0, true);
            this.pack8Bytes(asm, op, strideSrc, strideDst, src, dst, tmp, -8, false);
            asm.jmpb(labelDone);

            // Too short for any vector step: hand the remaining count to the scalar loop.
            asm.bind(labelCopyTail);
            asm.movl(len, tmp);
        }

        asm.testlAndJcc(len, len, AMD64Assembler.ConditionFlag.Zero, labelDone, true);
        asm.leaq(src, new AMD64Address(src, len, strideSrc));
        asm.leaq(dst, new AMD64Address(dst, len, strideDst));
        asm.negq(len);

        // Scalar loop: load one source element into tmp, store its low part to the destination.
        asm.bind(labelScalarLoop);
        switch (op) {
            case compressCharToByte:
                asm.movzwl(tmp, new AMD64Address(src, len, strideSrc));
                asm.movb(new AMD64Address(dst, len, strideDst), tmp);
                break;
            case compressIntToChar:
                asm.movl(tmp, new AMD64Address(src, len, strideSrc));
                asm.movw(new AMD64Address(dst, len, strideDst), tmp);
                break;
            case compressIntToByte:
                asm.movl(tmp, new AMD64Address(src, len, strideSrc));
                asm.movb(new AMD64Address(dst, len, strideDst), tmp);
                break;
            default:
                // getOp only yields compress operations for strideSrc > strideDst.
                break;
        }
        asm.incqAndJcc(len, AMD64Assembler.ConditionFlag.NotZero, labelScalarLoop, true);
        asm.bind(labelDone);
    }

    /**
     * Emits one vector-wide compression step: loads two (or, for int-to-byte, four) source
     * vectors, packs them into a single vector and stores it to the destination.
     *
     * @param displacement byte displacement on the destination side; the source displacement is
     *            scaled up by the stride ratio
     * @param direct if {@code true} the source is addressed directly at {@code src} without index
     *            and base displacement
     */
    private void packVector(AMD64MacroAssembler asm, AVXKind.AVXSize vecSize, Op op, Stride strideSrc, Stride strideDst, Register src, Register dst, Register index, int displacement, boolean direct) {
        int displacementSrc = displacement << (strideSrc.log2 - strideDst.log2);
        Register vec1 = ValueUtil.asRegister(this.vectorTemp[0]);
        Register vec2 = ValueUtil.asRegister(this.vectorTemp[1]);
        asm.movdqu(vecSize, vec1, indexAddressOrDirect(strideSrc, src, index, displacementSrc, 0, direct));
        asm.movdqu(vecSize, vec2, indexAddressOrDirect(strideSrc, src, index, displacementSrc, vecSize.getBytes(), direct));
        switch (op) {
            case compressCharToByte:
                asm.packuswb(vecSize, vec1, vec2);
                break;
            case compressIntToChar:
                asm.packusdw(vecSize, vec1, vec2);
                break;
            case compressIntToByte: {
                // Two-stage pack: int -> char on both halves, then char -> byte.
                Register vec3 = ValueUtil.asRegister(this.vectorTemp[2]);
                Register vec4 = ValueUtil.asRegister(this.vectorTemp[3]);
                asm.movdqu(vecSize, vec3, indexAddressOrDirect(strideSrc, src, index, displacementSrc, vecSize.getBytes() * 2, direct));
                asm.movdqu(vecSize, vec4, indexAddressOrDirect(strideSrc, src, index, displacementSrc, vecSize.getBytes() * 3, direct));
                asm.packusdw(vecSize, vec1, vec2);
                asm.packusdw(vecSize, vec3, vec4);
                asm.packuswb(vecSize, vec1, vec3);
                break;
            }
            default:
                // Only compress operations are passed in by emitCompress.
                break;
        }
        if (vecSize == AVXKind.AVXSize.YMM) {
            // The 256-bit pack instructions work per 128-bit lane, so the result has to be
            // reordered afterwards (see the Intel SDM entries for PACKUSWB/PACKUSDW).
            if (op == Op.compressIntToByte) {
                AMD64Assembler.VexRVMOp.VPERMD.emit(asm, vecSize, vec1, ValueUtil.asRegister(this.vectorTemp[4]), vec1);
            } else {
                // Selector 0b11_01_10_00 picks the quadwords in the order 0, 2, 1, 3.
                AMD64Assembler.VexRMIOp.VPERMQ.emit(asm, vecSize, vec1, vec1, 0b11011000);
            }
        }
        asm.movdqu(vecSize, new AMD64Address(dst, index, strideDst, displacement), vec1);
    }

    /**
     * Emits a compression step that produces 8 destination bytes using XMM registers. For the
     * single-stage operations the second pack operand is zeroed; int-to-byte loads a second
     * 16-byte source vector instead.
     *
     * @param displacement byte displacement on the destination side; the source displacement is
     *            scaled up by the stride ratio
     * @param direct if {@code true} the source is addressed directly at {@code src} without index
     *            and base displacement
     */
    private void pack8Bytes(AMD64MacroAssembler masm, Op op, Stride strideSrc, Stride strideDst, Register src, Register dst, Register index, int displacement, boolean direct) {
        Register vec1 = ValueUtil.asRegister(this.vectorTemp[0]);
        Register vec2 = ValueUtil.asRegister(this.vectorTemp[1]);
        int displacementSrc = displacement << (strideSrc.log2 - strideDst.log2);
        masm.movdqu(vec1, indexAddressOrDirect(strideSrc, src, index, displacementSrc, 0, direct));
        switch (op) {
            case compressCharToByte:
                masm.pxor(vec2, vec2);
                masm.packuswb(vec1, vec2);
                break;
            case compressIntToChar:
                masm.pxor(vec2, vec2);
                masm.packusdw(vec1, vec2);
                break;
            case compressIntToByte:
                masm.movdqu(vec2, indexAddressOrDirect(strideSrc, src, index, displacementSrc, 16, direct));
                masm.packusdw(vec1, vec2);
                masm.packuswb(vec1, vec2);
                break;
            default:
                // Only compress operations are passed in by emitCompress.
                break;
        }
        // Only the low 8 bytes of the packed result are stored.
        masm.movq(new AMD64Address(dst, index, strideDst, displacement), vec1);
    }

    /**
     * Builds a source address: {@code [array + displacement]} when {@code direct} is set,
     * otherwise {@code [array + index * strideSrc + baseOffset + displacement]}.
     */
    private static AMD64Address indexAddressOrDirect(Stride strideSrc, Register array, Register index, int baseOffset, int displacement, boolean direct) {
        if (direct) {
            return new AMD64Address(array, displacement);
        }
        return new AMD64Address(array, index, strideSrc, baseOffset + displacement);
    }

    /**
     * Emits a widening copy (source stride smaller than destination stride) using
     * {@code pmovSZx} with this op's {@code extendMode}.
     *
     * <p>
     * With SSE4.2 available the bulk is processed in full vectors followed by one more full vector
     * aligned to the end of the region. Inputs shorter than a full vector try an XMM-sized step
     * (AVX2 only) and, unless the operation is byte-to-int, a step that reads 4 source bytes and
     * writes 8 destination bytes; each of these is done once at the start and once aligned to the
     * end. Whatever remains, and everything when SSE4.2 is unavailable, goes through the scalar
     * loop.
     *
     * @param len element count on entry; clobbered
     * @param tmp scratch register; clobbered
     */
    private void emitInflate(CompilationResultBuilder crb, AMD64MacroAssembler asm, Stride strideSrc, Stride strideDst, Register src, Register dst, Register len, Register tmp) {
        Op op = getOp(strideDst, strideSrc);
        Register vec = ValueUtil.asRegister(this.vectorTemp[0]);
        Label labelScalarLoop = new Label();
        Label labelDone = new Label();

        asm.movl(tmp, len);
        if (asm.supports(AMD64.CPUFeature.SSE4_2)) {
            Label labelMainLoop = new Label();
            Label labelSkipXMMHalf = new Label();
            Label labelXMMTail = new Label();
            Label labelTail = new Label();
            // Destination elements per full vector / per XMM vector.
            int vectorLength = this.vectorSize.getBytes() / strideDst.value;
            int vectorLengthXMM = AVXKind.AVXSize.XMM.getBytes() / strideDst.value;
            // log2 of the width ratio; converts destination byte displacements to source ones.
            int scaleDelta = strideDst.log2 - strideSrc.log2;

            // tmp = number of tail elements, len = number of elements covered by whole vectors.
            asm.andl(tmp, vectorLength - 1);
            asm.andlAndJcc(len, -vectorLength, AMD64Assembler.ConditionFlag.Zero, this.supportsAVX2AndYMM() ? labelXMMTail : labelTail, true);

            // Point src/dst past the vectorized part and count len up from -len to zero.
            asm.leaq(src, new AMD64Address(src, len, strideSrc));
            asm.leaq(dst, new AMD64Address(dst, len, strideDst));
            asm.negq(len);

            // Vectorized main loop.
            asm.align(this.preferredLoopAlignment(crb));
            asm.bind(labelMainLoop);
            asm.pmovSZx(this.vectorSize, this.extendMode, vec, strideDst, new AMD64Address(src, len, strideSrc), strideSrc);
            asm.movdqu(this.vectorSize, new AMD64Address(dst, len, strideDst), vec);
            asm.addqAndJcc(len, vectorLength, AMD64Assembler.ConditionFlag.NotZero, labelMainLoop, true);

            // Tail: one more full vector ending exactly at the last element.
            asm.pmovSZx(this.vectorSize, this.extendMode, vec, strideDst, new AMD64Address(src, tmp, strideSrc, (-this.vectorSize.getBytes()) >> scaleDelta), strideSrc);
            asm.movdqu(this.vectorSize, new AMD64Address(dst, tmp, strideDst, -this.vectorSize.getBytes()), vec);
            asm.jmpb(labelDone);

            if (this.supportsAVX2AndYMM()) {
                // Fewer elements than one YMM vector: try two (possibly overlapping) XMM steps.
                asm.bind(labelXMMTail);
                asm.cmplAndJcc(tmp, vectorLengthXMM, AMD64Assembler.ConditionFlag.Less, labelTail, true);
                asm.pmovSZx(AVXKind.AVXSize.XMM, this.extendMode, vec, strideDst, new AMD64Address(src), strideSrc);
                asm.movdqu(new AMD64Address(dst), vec);
                asm.pmovSZx(AVXKind.AVXSize.XMM, this.extendMode, vec, strideDst, new AMD64Address(src, tmp, strideSrc, (-16) >> scaleDelta), strideSrc);
                asm.movdqu(new AMD64Address(dst, tmp, strideDst, -16), vec);
                asm.jmpb(labelDone);
            }

            asm.bind(labelTail);
            asm.movl(len, tmp);
            if (op != Op.inflateByteToInt) {
                assert (scaleDelta == 1) : scaleDelta;
                // At least 4 source bytes left: extend 4 source bytes into 8 destination bytes,
                // once from the start and once aligned to the end.
                asm.cmplAndJcc(len, 4 >> strideSrc.log2, AMD64Assembler.ConditionFlag.Less, labelSkipXMMHalf, true);
                asm.movdl(vec, new AMD64Address(src));
                asm.pmovSZx(AVXKind.AVXSize.XMM, this.extendMode, vec, strideDst, vec, strideSrc);
                asm.movq(new AMD64Address(dst), vec);
                asm.movdl(vec, new AMD64Address(src, len, strideSrc, -4));
                asm.pmovSZx(AVXKind.AVXSize.XMM, this.extendMode, vec, strideDst, vec, strideSrc);
                asm.movq(new AMD64Address(dst, len, strideDst, -8), vec);
                asm.jmpb(labelDone);
            }
            asm.bind(labelSkipXMMHalf);
        }

        asm.testlAndJcc(len, len, AMD64Assembler.ConditionFlag.Zero, labelDone, true);
        asm.leaq(src, new AMD64Address(src, len, strideSrc));
        asm.leaq(dst, new AMD64Address(dst, len, strideDst));
        asm.negq(len);

        // Scalar loop: zero-extending load of one source element, store at destination width.
        asm.bind(labelScalarLoop);
        switch (op) {
            case inflateByteToChar:
                asm.movzbl(tmp, new AMD64Address(src, len, strideSrc));
                asm.movw(new AMD64Address(dst, len, strideDst), tmp);
                break;
            case inflateByteToInt:
                asm.movzbl(tmp, new AMD64Address(src, len, strideSrc));
                asm.movl(new AMD64Address(dst, len, strideDst), tmp);
                break;
            case inflateCharToInt:
                asm.movzwl(tmp, new AMD64Address(src, len, strideSrc));
                asm.movl(new AMD64Address(dst, len, strideDst), tmp);
                break;
            default:
                // getOp only yields inflate operations for strideSrc < strideDst.
                break;
        }
        asm.incqAndJcc(len, AMD64Assembler.ConditionFlag.NotZero, labelScalarLoop, true);
        asm.bind(labelDone);
    }

    /**
     * Emits a same-stride copy.
     *
     * <p>
     * The bulk is copied in full vectors followed by one more full vector aligned to the end of
     * the region. Inputs shorter than a full vector fall through a cascade of progressively
     * smaller steps (16 bytes with AVX2, then 8, 4, 2 and 1 bytes as far as the stride permits);
     * each step copies one chunk from the start and one chunk aligned to the end, which may
     * overlap. In the sub-vector steps {@code len} is reused as the data scratch register.
     *
     * @param len element count on entry; clobbered
     * @param tmp scratch register; holds the tail element count
     */
    private void emitCopy(CompilationResultBuilder crb, AMD64MacroAssembler masm, Stride strideSrc, Stride strideDst, Register src, Register dst, Register len, Register tmp) {
        Register vec = ValueUtil.asRegister(this.vectorTemp[0]);
        Label labelTailXMM = new Label();
        Label labelTailQWORD = new Label();
        Label labelTailDWORD = new Label();
        Label labelTailWORD = new Label();
        Label labelTailBYTE = new Label();
        Label labelDone = new Label();
        int vectorLength = this.vectorSize.getBytes() / strideDst.value;

        masm.movl(tmp, len);
        // tmp = number of tail elements, len = number of elements covered by whole vectors.
        masm.andl(tmp, vectorLength - 1);
        masm.andlAndJcc(len, -vectorLength, AMD64Assembler.ConditionFlag.Zero, this.supportsAVX2AndYMM() ? labelTailXMM : labelTailQWORD, true);

        // Point src/dst past the vectorized part and count len up from -len to zero.
        masm.leaq(src, new AMD64Address(src, len, strideSrc));
        masm.leaq(dst, new AMD64Address(dst, len, strideDst));
        masm.negq(len);

        Label labelYMMLoop = new Label();
        Label labelXMMLoop = new Label();
        if (this.supportsAVX2AndYMM()) {
            // 32-byte main loop.
            masm.align(this.preferredLoopAlignment(crb));
            masm.bind(labelYMMLoop);
            masm.vmovdqu(vec, new AMD64Address(src, len, strideSrc));
            masm.vmovdqu(new AMD64Address(dst, len, strideDst), vec);
            masm.addqAndJcc(len, vectorLength, AMD64Assembler.ConditionFlag.NotZero, labelYMMLoop, true);
            // Tail: one more 32-byte copy ending exactly at the last element.
            masm.vmovdqu(vec, new AMD64Address(src, tmp, strideSrc, -32));
            masm.vmovdqu(new AMD64Address(dst, tmp, strideDst, -32), vec);
            masm.jmpb(labelDone);

            // Fewer than 32 bytes in total: try two (possibly overlapping) 16-byte copies.
            masm.bind(labelTailXMM);
            masm.cmplAndJcc(tmp, 16 / strideDst.value, AMD64Assembler.ConditionFlag.Less, labelTailQWORD, true);
            masm.movdqu(vec, new AMD64Address(src));
            masm.movdqu(new AMD64Address(dst), vec);
            masm.movdqu(vec, new AMD64Address(src, tmp, strideSrc, -16));
            masm.movdqu(new AMD64Address(dst, tmp, strideDst, -16), vec);
            masm.jmpb(labelDone);
        } else {
            // 16-byte main loop.
            // NOTE(review): the fixed -16 tail displacement presumes vectorSize is XMM whenever
            // supportsAVX2AndYMM() is false - confirm in AMD64ComplexVectorOp.
            masm.align(this.preferredLoopAlignment(crb));
            masm.bind(labelXMMLoop);
            masm.movdqu(vec, new AMD64Address(src, len, strideSrc));
            masm.movdqu(new AMD64Address(dst, len, strideDst), vec);
            masm.addqAndJcc(len, vectorLength, AMD64Assembler.ConditionFlag.NotZero, labelXMMLoop, true);
            masm.movdqu(vec, new AMD64Address(src, tmp, strideSrc, -16));
            masm.movdqu(new AMD64Address(dst, tmp, strideDst, -16), vec);
            masm.jmpb(labelDone);
        }

        // 8-byte step.
        masm.bind(labelTailQWORD);
        masm.cmplAndJcc(tmp, 8 / strideDst.value, AMD64Assembler.ConditionFlag.Less, labelTailDWORD, true);
        masm.movq(len, new AMD64Address(src));
        masm.movq(new AMD64Address(dst), len);
        masm.movq(len, new AMD64Address(src, tmp, strideSrc, -8));
        masm.movq(new AMD64Address(dst, tmp, strideDst, -8), len);
        masm.jmpb(labelDone);

        // 4-byte step. For a 4-byte stride this is the smallest unit, so only emptiness is
        // checked.
        masm.bind(labelTailDWORD);
        if (strideDst.value < 4) {
            masm.cmplAndJcc(tmp, 4 / strideDst.value, AMD64Assembler.ConditionFlag.Less, labelTailWORD, true);
        } else {
            masm.testlAndJcc(tmp, tmp, AMD64Assembler.ConditionFlag.Zero, labelDone, true);
        }
        masm.movl(len, new AMD64Address(src));
        masm.movl(new AMD64Address(dst), len);
        masm.movl(len, new AMD64Address(src, tmp, strideSrc, -4));
        masm.movl(new AMD64Address(dst, tmp, strideDst, -4), len);

        if (strideDst.value < 4) {
            // 2-byte step, only emitted for strides below 4 bytes.
            masm.jmpb(labelDone);
            masm.bind(labelTailWORD);
            if (strideDst.value < 2) {
                masm.cmplAndJcc(tmp, 2 / strideDst.value, AMD64Assembler.ConditionFlag.Less, labelTailBYTE, true);
            } else {
                masm.testlAndJcc(tmp, tmp, AMD64Assembler.ConditionFlag.Zero, labelDone, true);
            }
            masm.movw(len, new AMD64Address(src));
            masm.movw(new AMD64Address(dst), len);
            masm.movw(len, new AMD64Address(src, tmp, strideSrc, -2));
            masm.movw(new AMD64Address(dst, tmp, strideDst, -2), len);
        }
        if (strideDst.value < 2) {
            // Single byte, only emitted for the 1-byte stride.
            masm.jmpb(labelDone);
            masm.bind(labelTailBYTE);
            masm.testlAndJcc(tmp, tmp, AMD64Assembler.ConditionFlag.Zero, labelDone, true);
            masm.movb(len, new AMD64Address(src));
            masm.movb(new AMD64Address(dst), len);
        }
        masm.bind(labelDone);
    }

    /**
     * The element conversions this instruction can emit, named
     * {@code <direction><SourceType>To<DestinationType>}; {@code copy} is the same-stride case.
     */
    private static enum Op {
        compressCharToByte,
        compressIntToChar,
        compressIntToByte,
        inflateByteToChar,
        inflateByteToInt,
        inflateCharToInt,
        copy;

    }
}

