arch-x86: implement movntq/movntdq instructions

Implement the non-temporal quadword (MOVNTQ) and double-quadword
(MOVNTDQ) move instructions.
This change ignores the non-temporal hint: the instructions are
implemented to send ordinary cacheable requests to memory.
This has some "performance" impact (i.e. some cache pollution that
real hardware would avoid) in exchange for better "correctness" in
behavior.

Change-Id: I2052ac0970f61a54bafb7332762debcb7103202d
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5/+/20288
Tested-by: kokoro <noreply+kokoro@google.com>
Reviewed-by: Jason Lowe-Power <jason@lowepower.com>
Maintainer: Jason Lowe-Power <jason@lowepower.com>
diff --git a/src/arch/x86/isa/decoder/two_byte_opcodes.isa b/src/arch/x86/isa/decoder/two_byte_opcodes.isa
index 7a4f9e1..a23531c 100644
--- a/src/arch/x86/isa/decoder/two_byte_opcodes.isa
+++ b/src/arch/x86/isa/decoder/two_byte_opcodes.isa
@@ -978,7 +978,9 @@
                     0x3: PAVGW(Pq,Qq);
                     0x4: PMULHUW(Pq,Qq);
                     0x5: PMULHW(Pq,Qq);
-                    0x7: WarnUnimpl::movntq_Mq_Pq();
+                    //Non-temporal hint is ignored since we don't have
+                    //proper support for it in the memory system.
+                    0x7: MOVNTQ(Mq,Pq);
                     default: UD2();
                 }
                 // repe (0xF3)
@@ -995,7 +997,11 @@
                     0x4: PMULHUW(Vo,Wo);
                     0x5: PMULHW(Vo,Wo);
                     0x6: CVTTPD2DQ(Vo,Wo);
-                    0x7: WarnUnimpl::movntdq_Mo_Vo();
+                    //MOVNTDQ should really use size o (octword), but
+                    //because it is split in two, we use q (quadword).
+                    //Non-temporal hint is ignored since we don't have
+                    //proper support for it in the memory system.
+                    0x7: MOVNTDQ(Mq,Vq);
                 }
                 // repne (0xF2)
                 0x8: decode OPCODE_OP_BOTTOM3 {
diff --git a/src/arch/x86/isa/insts/simd128/integer/data_transfer/move_non_temporal.py b/src/arch/x86/isa/insts/simd128/integer/data_transfer/move_non_temporal.py
index 792153a..08296bd 100644
--- a/src/arch/x86/isa/insts/simd128/integer/data_transfer/move_non_temporal.py
+++ b/src/arch/x86/isa/insts/simd128/integer/data_transfer/move_non_temporal.py
@@ -34,9 +34,23 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 # Authors: Gabe Black
+#          Pouya Fotouhi
 
 microcode = '''
-# MOVNTDQ
+def macroop MOVNTDQ_M_XMM {
+    warn "MOVNTDQ: Ignoring non-temporal hint, modeling as cacheable!"
+    cda seg, sib, "DISPLACEMENT + 8", dataSize=8
+    stfp xmml, seg, sib, "DISPLACEMENT", dataSize=8
+    stfp xmmh, seg, sib, "DISPLACEMENT + 8", dataSize=8
+};
+
+def macroop MOVNTDQ_P_XMM {
+    warn "MOVNTDQ_P: Ignoring non-temporal hint, modeling as cacheable!"
+    rdip t7
+    cda seg, riprel, "DISPLACEMENT + 8", dataSize=8
+    stfp xmml, seg, riprel, "DISPLACEMENT", dataSize=8
+    stfp xmmh, seg, riprel, "DISPLACEMENT + 8", dataSize=8
+};
 
 def macroop MASKMOVDQU_XMM_XMM {
     ldfp ufp1, ds, [1, t0, rdi], dataSize=8
diff --git a/src/arch/x86/isa/insts/simd64/integer/data_transfer/move_non_temporal.py b/src/arch/x86/isa/insts/simd64/integer/data_transfer/move_non_temporal.py
index f068a06..ccd37f1 100644
--- a/src/arch/x86/isa/insts/simd64/integer/data_transfer/move_non_temporal.py
+++ b/src/arch/x86/isa/insts/simd64/integer/data_transfer/move_non_temporal.py
@@ -34,9 +34,19 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 # Authors: Gabe Black
+#          Pouya Fotouhi
 
 microcode = '''
-# MOVNTQ
+def macroop MOVNTQ_M_MMX {
+    warn "MOVNTQ: Ignoring non-temporal hint, modeling as cacheable!"
+    stfp mmx, seg, sib, "DISPLACEMENT", dataSize=8
+};
+
+def macroop MOVNTQ_P_MMX {
+    warn "MOVNTQ_P: Ignoring non-temporal hint, modeling as cacheable!"
+    rdip t7
+    stfp mmx, seg, riprel, "DISPLACEMENT", dataSize=8
+};
 
 def macroop MASKMOVQ_MMX_MMX {
     ldfp ufp1, ds, [1, t0, rdi], dataSize=8