81 files changed, 281 insertions, 259 deletions
diff --git a/devtools/checkpatches.sh b/devtools/checkpatches.sh
index c47ea59501..8f245ebdab 100755
--- a/devtools/checkpatches.sh
+++ b/devtools/checkpatches.sh
@@ -336,6 +336,21 @@ check_internal_tags() { # <patch>
 	return $res
 }
 
+check_aligned_attributes() { # <patch>
+	res=0
+
+	for token in __rte_aligned __rte_cache_aligned __rte_cache_min_aligned; do
+		if [ $(grep -E '^\+.*\<'$token'\>' "$1" | \
+				grep -vE '\<(struct|union)[[:space:]]*'$token'\>' | \
+				wc -l) != 0 ]; then
+			echo "Please use $token only for struct or union types alignment."
+			res=1
+		fi
+	done
+
+	return $res
+}
+
 check_release_notes() { # <patch>
 	rel_notes_prefix=doc/guides/rel_notes/release_
 	IFS=. read year month release < VERSION
@@ -445,6 +460,14 @@ check () { # <patch-file> <commit>
 		ret=1
 	fi
 
+	! $verbose || printf '\nChecking alignment attributes:\n'
+	report=$(check_aligned_attributes "$tmpinput")
+	if [ $? -ne 0 ] ; then
+		$headline_printed || print_headline "$subject"
+		printf '%s\n' "$report"
+		ret=1
+	fi
+
 	! $verbose || printf '\nChecking release notes updates:\n'
 	report=$(check_release_notes "$tmpinput")
 	if [ $? -ne 0 ] ; then
diff --git a/lib/acl/acl_run.h b/lib/acl/acl_run.h
index 7d215de9d6..7f092413cd 100644
--- a/lib/acl/acl_run.h
+++ b/lib/acl/acl_run.h
@@ -55,12 +55,12 @@ struct acl_flow_data {
  * Structure to maintain running results for
  * a single packet (up to 4 tries).
  */
-struct completion {
+struct __rte_aligned(XMM_SIZE) completion {
 	uint32_t *results;                          /* running results. */
 	int32_t   priority[RTE_ACL_MAX_CATEGORIES]; /* running priorities. */
 	uint32_t  count;                            /* num of remaining tries */
 	/* true for allocated struct */
-} __rte_aligned(XMM_SIZE);
+};
 
 /*
  * One parms structure for each slot in the search engine.
diff --git a/lib/bpf/bpf_pkt.c b/lib/bpf/bpf_pkt.c
index 793a75ded9..aaca935f2d 100644
--- a/lib/bpf/bpf_pkt.c
+++ b/lib/bpf/bpf_pkt.c
@@ -23,7 +23,7 @@
  * information about installed BPF rx/tx callback
  */
 
-struct bpf_eth_cbi {
+struct __rte_cache_aligned bpf_eth_cbi {
 	/* used by both data & control path */
 	RTE_ATOMIC(uint32_t) use;    /*usage counter */
 	const struct rte_eth_rxtx_callback *cb;  /* callback handle */
@@ -33,7 +33,7 @@ struct bpf_eth_cbi {
 	LIST_ENTRY(bpf_eth_cbi) link;
 	uint16_t port;
 	uint16_t queue;
-} __rte_cache_aligned;
+};
 
 /*
  * Odd number means that callback is used by datapath.
diff --git a/lib/compressdev/rte_comp.h b/lib/compressdev/rte_comp.h
index 3606ebf8e6..830a240b6b 100644
--- a/lib/compressdev/rte_comp.h
+++ b/lib/compressdev/rte_comp.h
@@ -356,7 +356,7 @@ struct rte_comp_xform {
  * Comp operations are enqueued and dequeued in comp PMDs using the
  * rte_compressdev_enqueue_burst() / rte_compressdev_dequeue_burst() APIs
  */
-struct rte_comp_op {
+struct __rte_cache_aligned rte_comp_op {
 	enum rte_comp_op_type op_type;
 	union {
 		void *private_xform;
@@ -478,7 +478,7 @@ struct rte_comp_op {
 	 * will be set to RTE_COMP_OP_STATUS_SUCCESS after operation
 	 * is successfully processed by a PMD
 	 */
-} __rte_cache_aligned;
+};
 
 /**
  * Creates an operation pool
diff --git a/lib/compressdev/rte_compressdev_internal.h b/lib/compressdev/rte_compressdev_internal.h
index 0bc8c874e0..67f8b51a37 100644
--- a/lib/compressdev/rte_compressdev_internal.h
+++ b/lib/compressdev/rte_compressdev_internal.h
@@ -69,7 +69,7 @@ typedef uint16_t (*compressdev_enqueue_pkt_burst_t)(void *qp,
 		struct rte_comp_op **ops, uint16_t nb_ops);
 
 /** The data structure associated with each comp device. */
-struct rte_compressdev {
+struct __rte_cache_aligned rte_compressdev {
 	compressdev_dequeue_pkt_burst_t dequeue_burst;
 	/**< Pointer to PMD receive function */
 	compressdev_enqueue_pkt_burst_t enqueue_burst;
@@ -87,7 +87,7 @@ struct rte_compressdev {
 	__extension__
 	uint8_t attached : 1;
 	/**< Flag indicating the device is attached */
-} __rte_cache_aligned;
+};
 
 /**
  *
@@ -96,7 +96,7 @@ struct rte_compressdev {
  * This structure is safe to place in shared memory to be common among
  * different processes in a multi-process configuration.
  */
-struct rte_compressdev_data {
+struct __rte_cache_aligned rte_compressdev_data {
 	uint8_t dev_id;
 	/**< Compress device identifier */
 	int socket_id;
@@ -115,7 +115,7 @@ struct rte_compressdev_data {
 
 	void *dev_private;
 	/**< PMD-specific private data */
-} __rte_cache_aligned;
+};
 
 #ifdef __cplusplus
 }
diff --git a/lib/cryptodev/cryptodev_pmd.h b/lib/cryptodev/cryptodev_pmd.h
index 9139975eea..d195b81771 100644
--- a/lib/cryptodev/cryptodev_pmd.h
+++ b/lib/cryptodev/cryptodev_pmd.h
@@ -61,7 +61,7 @@ struct rte_cryptodev_pmd_init_params {
  * This structure is safe to place in shared memory to be common among
  * different processes in a multi-process configuration.
  */
-struct rte_cryptodev_data {
+struct __rte_cache_aligned rte_cryptodev_data {
 	/** Device ID for this instance */
 	uint8_t dev_id;
 	/** Socket ID where memory is allocated */
@@ -82,10 +82,10 @@ struct rte_cryptodev_data {
 
 	/** PMD-specific private data */
 	void *dev_private;
-} __rte_cache_aligned;
+};
 
 /** @internal The data structure associated with each crypto device. */
-struct rte_cryptodev {
+struct __rte_cache_aligned rte_cryptodev {
 	/** Pointer to PMD dequeue function. */
 	dequeue_pkt_burst_t dequeue_burst;
 	/** Pointer to PMD enqueue function. */
@@ -117,7 +117,7 @@ struct rte_cryptodev {
 	struct rte_cryptodev_cb_rcu *enq_cbs;
 	/** User application callback for post dequeue processing */
 	struct rte_cryptodev_cb_rcu *deq_cbs;
-} __rte_cache_aligned;
+};
 
 /** Global structure used for maintaining state of allocated crypto devices */
 struct rte_cryptodev_global {
diff --git a/lib/cryptodev/rte_cryptodev_core.h b/lib/cryptodev/rte_cryptodev_core.h
index 5de89d099f..8d7e58d76d 100644
--- a/lib/cryptodev/rte_cryptodev_core.h
+++ b/lib/cryptodev/rte_cryptodev_core.h
@@ -40,7 +40,7 @@ struct rte_cryptodev_qpdata {
 	struct rte_cryptodev_cb_rcu *deq_cb;
 };
 
-struct rte_crypto_fp_ops {
+struct __rte_cache_aligned rte_crypto_fp_ops {
 	/** PMD enqueue burst function. */
 	enqueue_pkt_burst_t enqueue_burst;
 	/** PMD dequeue burst function. */
@@ -49,7 +49,7 @@ struct rte_crypto_fp_ops {
 	struct rte_cryptodev_qpdata qp;
 	/** Reserved for future ops. */
 	uintptr_t reserved[3];
-} __rte_cache_aligned;
+};
 
 extern struct rte_crypto_fp_ops rte_crypto_fp_ops[RTE_CRYPTO_MAX_DEVS];
 
diff --git a/lib/dispatcher/rte_dispatcher.c b/lib/dispatcher/rte_dispatcher.c
index f546d75c0b..79349172bc 100644
--- a/lib/dispatcher/rte_dispatcher.c
+++ b/lib/dispatcher/rte_dispatcher.c
@@ -41,7 +41,7 @@ struct rte_dispatcher_finalizer {
 	void *finalize_data;
 };
 
-struct rte_dispatcher_lcore {
+struct __rte_cache_aligned rte_dispatcher_lcore {
 	uint8_t num_ports;
 	uint16_t num_handlers;
 	int32_t prio_count;
@@ -49,7 +49,7 @@ struct rte_dispatcher_lcore {
 	struct rte_dispatcher_handler handlers[EVD_MAX_HANDLERS];
 	struct rte_dispatcher_stats stats;
 	RTE_CACHE_GUARD;
-} __rte_cache_aligned;
+};
 
 struct rte_dispatcher {
 	uint8_t event_dev_id;
diff --git a/lib/distributor/distributor_private.h b/lib/distributor/distributor_private.h
index caa9b7af7a..07c2c05c64 100644
--- a/lib/distributor/distributor_private.h
+++ b/lib/distributor/distributor_private.h
@@ -53,10 +53,10 @@
  * the next cache line to worker 0, we pad this out to three cache lines.
  * Only 64-bits of the memory is actually used though.
  */
-union rte_distributor_buffer_single {
+union __rte_cache_aligned rte_distributor_buffer_single {
 	volatile RTE_ATOMIC(int64_t) bufptr64;
 	char pad[RTE_CACHE_LINE_SIZE*3];
-} __rte_cache_aligned;
+};
 
 /*
  * Transfer up to 8 mbufs at a time to/from workers, and
@@ -64,12 +64,12 @@ union rte_distributor_buffer_single {
  */
 #define RTE_DIST_BURST_SIZE 8
 
-struct rte_distributor_backlog {
+struct __rte_cache_aligned rte_distributor_backlog {
 	unsigned int start;
 	unsigned int count;
 	alignas(RTE_CACHE_LINE_SIZE) int64_t pkts[RTE_DIST_BURST_SIZE];
 	uint16_t *tags; /* will point to second cacheline of inflights */
-} __rte_cache_aligned;
+};
 
 
 struct rte_distributor_returned_pkts {
diff --git a/lib/dmadev/rte_dmadev_core.h b/lib/dmadev/rte_dmadev_core.h
index e8239c2d22..29f52514d7 100644
--- a/lib/dmadev/rte_dmadev_core.h
+++ b/lib/dmadev/rte_dmadev_core.h
@@ -61,7 +61,7 @@ typedef uint16_t (*rte_dma_burst_capacity_t)(const void *dev_private, uint16_t v
  * The 'dev_private' field was placed in the first cache line to optimize
  * performance because the PMD mainly depends on this field.
  */
-struct rte_dma_fp_object {
+struct __rte_cache_aligned rte_dma_fp_object {
 	/** PMD-specific private data. The driver should copy
 	 * rte_dma_dev.data->dev_private to this field during initialization.
 	 */
@@ -73,7 +73,7 @@ struct rte_dma_fp_object {
 	rte_dma_completed_t        completed;
 	rte_dma_completed_status_t completed_status;
 	rte_dma_burst_capacity_t   burst_capacity;
-} __rte_cache_aligned;
+};
 
 extern struct rte_dma_fp_object *rte_dma_fp_objs;
 
diff --git a/lib/dmadev/rte_dmadev_pmd.h b/lib/dmadev/rte_dmadev_pmd.h
index 7f354f619a..58729088ff 100644
--- a/lib/dmadev/rte_dmadev_pmd.h
+++ b/lib/dmadev/rte_dmadev_pmd.h
@@ -94,7 +94,7 @@ struct rte_dma_dev_ops {
  *
  * @see struct rte_dma_dev::data
  */
-struct rte_dma_dev_data {
+struct __rte_cache_aligned rte_dma_dev_data {
 	char dev_name[RTE_DEV_NAME_MAX_LEN]; /**< Unique identifier name */
 	int16_t dev_id; /**< Device [external] identifier. */
 	int16_t numa_node; /**< Local NUMA memory ID. -1 if unknown. */
@@ -103,7 +103,7 @@ struct rte_dma_dev_data {
 	__extension__
 	uint8_t dev_started : 1; /**< Device state: STARTED(1)/STOPPED(0). */
 	uint64_t reserved[2]; /**< Reserved for future fields */
-} __rte_cache_aligned;
+};
 
 /**
  * Possible states of a DMA device.
@@ -122,7 +122,7 @@ enum rte_dma_dev_state {
  * @internal
  * The generic data structure associated with each DMA device.
  */
-struct rte_dma_dev {
+struct __rte_cache_aligned rte_dma_dev {
 	/** Device info which supplied during device initialization. */
 	struct rte_device *device;
 	struct rte_dma_dev_data *data; /**< Pointer to shared device data. */
@@ -132,7 +132,7 @@ struct rte_dma_dev {
 	const struct rte_dma_dev_ops *dev_ops;
 	enum rte_dma_dev_state state; /**< Flag indicating the device state. */
 	uint64_t reserved[2]; /**< Reserved for future fields. */
-} __rte_cache_aligned;
+};
 
 /**
  * @internal
diff --git a/lib/eal/arm/include/rte_vect.h b/lib/eal/arm/include/rte_vect.h
index 8cfe4bddc1..c97d299a3e 100644
--- a/lib/eal/arm/include/rte_vect.h
+++ b/lib/eal/arm/include/rte_vect.h
@@ -24,14 +24,14 @@ typedef int32x4_t xmm_t;
 #define	XMM_SIZE	(sizeof(xmm_t))
 #define	XMM_MASK	(XMM_SIZE - 1)
 
-typedef union rte_xmm {
+typedef union __rte_aligned(16) rte_xmm {
 	xmm_t    x;
 	uint8_t  u8[XMM_SIZE / sizeof(uint8_t)];
 	uint16_t u16[XMM_SIZE / sizeof(uint16_t)];
 	uint32_t u32[XMM_SIZE / sizeof(uint32_t)];
 	uint64_t u64[XMM_SIZE / sizeof(uint64_t)];
 	double   pd[XMM_SIZE / sizeof(double)];
-} __rte_aligned(16) rte_xmm_t;
+} rte_xmm_t;
 
 #if defined(RTE_ARCH_ARM) && defined(RTE_ARCH_32)
 /* NEON intrinsic vqtbl1q_u8() is not supported in ARMv7-A(AArch32) */
diff --git a/lib/eal/common/malloc_elem.h b/lib/eal/common/malloc_elem.h
index 952ce7343b..c7ff6718f8 100644
--- a/lib/eal/common/malloc_elem.h
+++ b/lib/eal/common/malloc_elem.h
@@ -20,7 +20,7 @@ enum elem_state {
 	ELEM_PAD  /* element is a padding-only header */
 };
 
-struct malloc_elem {
+struct __rte_cache_aligned malloc_elem {
 	struct malloc_heap *heap;
 	struct malloc_elem *volatile prev;
 	/**< points to prev elem in memseg */
@@ -48,7 +48,7 @@ struct malloc_elem {
 	size_t user_size;
 	uint64_t asan_cookie[2]; /* must be next to header_cookie */
 #endif
-} __rte_cache_aligned;
+};
 
 static const unsigned int MALLOC_ELEM_HEADER_LEN = sizeof(struct malloc_elem);
 
diff --git a/lib/eal/common/malloc_heap.h b/lib/eal/common/malloc_heap.h
index 8f3ab57154..0c49588005 100644
--- a/lib/eal/common/malloc_heap.h
+++ b/lib/eal/common/malloc_heap.h
@@ -21,7 +21,7 @@ struct malloc_elem;
 /**
  * Structure to hold malloc heap
  */
-struct malloc_heap {
+struct __rte_cache_aligned malloc_heap {
 	rte_spinlock_t lock;
 	LIST_HEAD(, malloc_elem) free_head[RTE_HEAP_NUM_FREELISTS];
 	struct malloc_elem *volatile first;
@@ -31,7 +31,7 @@ struct malloc_heap {
 	unsigned int socket_id;
 	size_t total_size;
 	char name[RTE_HEAP_NAME_MAX_LEN];
-} __rte_cache_aligned;
+};
 
 void *
 malloc_heap_alloc(const char *type, size_t size, int socket, unsigned int flags,
diff --git a/lib/eal/common/rte_random.c b/lib/eal/common/rte_random.c
index 7709b8f2c6..90e91b3c4f 100644
--- a/lib/eal/common/rte_random.c
+++ b/lib/eal/common/rte_random.c
@@ -13,14 +13,14 @@
 #include <rte_lcore.h>
 #include <rte_random.h>
 
-struct rte_rand_state {
+struct __rte_cache_aligned rte_rand_state {
 	uint64_t z1;
 	uint64_t z2;
 	uint64_t z3;
 	uint64_t z4;
 	uint64_t z5;
 	RTE_CACHE_GUARD;
-} __rte_cache_aligned;
+};
 
 /* One instance each for every lcore id-equipped thread, and one
  * additional instance to be shared by all others threads (i.e., all
diff --git a/lib/eal/common/rte_service.c b/lib/eal/common/rte_service.c
index d959c91459..56379930b6 100644
--- a/lib/eal/common/rte_service.c
+++ b/lib/eal/common/rte_service.c
@@ -32,7 +32,7 @@
 #define RUNSTATE_RUNNING 1
 
 /* internal representation of a service */
-struct rte_service_spec_impl {
+struct __rte_cache_aligned rte_service_spec_impl {
 	/* public part of the struct */
 	struct rte_service_spec spec;
 
@@ -53,7 +53,7 @@ struct rte_service_spec_impl {
 	 * on currently.
 	 */
 	RTE_ATOMIC(uint32_t) num_mapped_cores;
-} __rte_cache_aligned;
+};
 
 struct service_stats {
 	RTE_ATOMIC(uint64_t) calls;
@@ -61,7 +61,7 @@ struct service_stats {
 };
 
 /* the internal values of a service core */
-struct core_state {
+struct __rte_cache_aligned core_state {
 	/* map of services IDs are run on this core */
 	uint64_t service_mask;
 	RTE_ATOMIC(uint8_t) runstate; /* running or stopped */
@@ -71,7 +71,7 @@ struct core_state {
 	RTE_ATOMIC(uint64_t) loops;
 	RTE_ATOMIC(uint64_t) cycles;
 	struct service_stats service_stats[RTE_SERVICE_NUM_MAX];
-} __rte_cache_aligned;
+};
 
 static uint32_t rte_service_count;
 static struct rte_service_spec_impl *rte_services;
diff --git a/lib/eal/include/generic/rte_atomic.h b/lib/eal/include/generic/rte_atomic.h
index 0e639dad76..f859707744 100644
--- a/lib/eal/include/generic/rte_atomic.h
+++ b/lib/eal/include/generic/rte_atomic.h
@@ -1094,7 +1094,7 @@ static inline void rte_atomic64_clear(rte_atomic64_t *v)
 /**
  * 128-bit integer structure.
  */
-typedef struct {
+typedef struct __rte_aligned(16) {
 	union {
 		uint64_t val[2];
 #ifdef RTE_ARCH_64
@@ -1103,7 +1103,7 @@ typedef struct {
 #endif
 #endif
 	};
-} __rte_aligned(16) rte_int128_t;
+} rte_int128_t;
 
 #ifdef __DOXYGEN__
 
diff --git a/lib/eal/include/rte_common.h b/lib/eal/include/rte_common.h
index 1362eec66d..dd8b845ed3 100644
--- a/lib/eal/include/rte_common.h
+++ b/lib/eal/include/rte_common.h
@@ -64,10 +64,19 @@ extern "C" {
 #endif
 
 /**
- * Force alignment
+ * Force type alignment
+ *
+ * This macro should be used when alignment of a struct or union type
+ * is required. For toolchain compatibility it should appear between
+ * the {struct,union} keyword and tag. e.g.
+ *
+ *   struct __rte_aligned(8) tag { ... };
+ *
+ * If alignment of an object/variable is required then this macro should
+ * not be used, instead prefer C11 alignas(a).
  */
 #ifdef RTE_TOOLCHAIN_MSVC
-#define __rte_aligned(a)
+#define __rte_aligned(a) __declspec(align(a))
 #else
 #define __rte_aligned(a) __attribute__((__aligned__(a)))
 #endif
@@ -539,11 +548,7 @@ rte_is_aligned(const void * const __rte_restrict ptr, const unsigned int align)
 #define RTE_CACHE_LINE_MIN_SIZE 64
 
 /** Force alignment to cache line. */
-#ifdef RTE_TOOLCHAIN_MSVC
-#define __rte_cache_aligned
-#else
 #define __rte_cache_aligned __rte_aligned(RTE_CACHE_LINE_SIZE)
-#endif
 
 /** Force minimum cache line alignment. */
 #define __rte_cache_min_aligned __rte_aligned(RTE_CACHE_LINE_MIN_SIZE)
diff --git a/lib/eal/loongarch/include/rte_vect.h b/lib/eal/loongarch/include/rte_vect.h
index 15465151e1..aa334e869a 100644
--- a/lib/eal/loongarch/include/rte_vect.h
+++ b/lib/eal/loongarch/include/rte_vect.h
@@ -15,7 +15,7 @@ extern "C" {
 
 #define RTE_VECT_DEFAULT_SIMD_BITWIDTH RTE_VECT_SIMD_DISABLED
 
-typedef union xmm {
+typedef union __rte_aligned(16) xmm {
 	int8_t   i8[16];
 	int16_t  i16[8];
 	int32_t  i32[4];
@@ -25,19 +25,19 @@ typedef union xmm {
 	uint32_t u32[4];
 	uint64_t u64[2];
 	double   pd[2];
-} __rte_aligned(16) xmm_t;
+} xmm_t;
 
 #define XMM_SIZE        (sizeof(xmm_t))
 #define XMM_MASK        (XMM_SIZE - 1)
 
-typedef union rte_xmm {
+typedef union __rte_aligned(16) rte_xmm {
 	xmm_t	 x;
 	uint8_t	 u8[XMM_SIZE / sizeof(uint8_t)];
 	uint16_t u16[XMM_SIZE / sizeof(uint16_t)];
 	uint32_t u32[XMM_SIZE / sizeof(uint32_t)];
 	uint64_t u64[XMM_SIZE / sizeof(uint64_t)];
 	double   pd[XMM_SIZE / sizeof(double)];
-} __rte_aligned(16) rte_xmm_t;
+} rte_xmm_t;
 
 static inline xmm_t
 vect_load_128(void *p)
diff --git a/lib/eal/ppc/include/rte_vect.h b/lib/eal/ppc/include/rte_vect.h
index a5f009b7df..c8bace2d75 100644
--- a/lib/eal/ppc/include/rte_vect.h
+++ b/lib/eal/ppc/include/rte_vect.h
@@ -22,14 +22,14 @@ typedef __vector signed int xmm_t;
 #define	XMM_SIZE	(sizeof(xmm_t))
 #define	XMM_MASK	(XMM_SIZE - 1)
 
-typedef union rte_xmm {
+typedef union __rte_aligned(16) rte_xmm {
 	xmm_t    x;
 	uint8_t  u8[XMM_SIZE / sizeof(uint8_t)];
 	uint16_t u16[XMM_SIZE / sizeof(uint16_t)];
 	uint32_t u32[XMM_SIZE / sizeof(uint32_t)];
 	uint64_t u64[XMM_SIZE / sizeof(uint64_t)];
 	double   pd[XMM_SIZE / sizeof(double)];
-} __rte_aligned(16) rte_xmm_t;
+} rte_xmm_t;
 
 #ifdef __cplusplus
 }
diff --git a/lib/eal/riscv/include/rte_vect.h b/lib/eal/riscv/include/rte_vect.h
index da9092a94a..6df10fa8ee 100644
--- a/lib/eal/riscv/include/rte_vect.h
+++ b/lib/eal/riscv/include/rte_vect.h
@@ -22,14 +22,14 @@ typedef int32_t		xmm_t __attribute__((vector_size(16)));
 #define XMM_SIZE	(sizeof(xmm_t))
 #define XMM_MASK	(XMM_SIZE - 1)
 
-typedef union rte_xmm {
+typedef union __rte_aligned(16) rte_xmm {
 	xmm_t		x;
 	uint8_t		u8[XMM_SIZE / sizeof(uint8_t)];
 	uint16_t	u16[XMM_SIZE / sizeof(uint16_t)];
 	uint32_t	u32[XMM_SIZE / sizeof(uint32_t)];
 	uint64_t	u64[XMM_SIZE / sizeof(uint64_t)];
 	double		pd[XMM_SIZE / sizeof(double)];
-} __rte_aligned(16) rte_xmm_t;
+} rte_xmm_t;
 
 static inline xmm_t
 vect_load_128(void *p)
diff --git a/lib/eal/x86/include/rte_vect.h b/lib/eal/x86/include/rte_vect.h
index a3e1ffe405..5ac3ccfd82 100644
--- a/lib/eal/x86/include/rte_vect.h
+++ b/lib/eal/x86/include/rte_vect.h
@@ -94,7 +94,7 @@ __extension__ ({                 \
 #define RTE_X86_ZMM_SIZE	(sizeof(__m512i))
 #define RTE_X86_ZMM_MASK	(RTE_X86_ZMM_SIZE - 1)
 
-typedef union __rte_x86_zmm {
+typedef union __rte_aligned(RTE_X86_ZMM_SIZE) __rte_x86_zmm {
 	__m512i	 z;
 	ymm_t    y[RTE_X86_ZMM_SIZE / sizeof(ymm_t)];
 	xmm_t    x[RTE_X86_ZMM_SIZE / sizeof(xmm_t)];
@@ -103,7 +103,7 @@ typedef union __rte_x86_zmm {
 	uint32_t u32[RTE_X86_ZMM_SIZE / sizeof(uint32_t)];
 	uint64_t u64[RTE_X86_ZMM_SIZE / sizeof(uint64_t)];
 	double   pd[RTE_X86_ZMM_SIZE / sizeof(double)];
-} __rte_aligned(RTE_X86_ZMM_SIZE) __rte_x86_zmm_t;
+} __rte_x86_zmm_t;
 
 #endif /* __AVX512F__ */
 
diff --git a/lib/ethdev/ethdev_driver.h b/lib/ethdev/ethdev_driver.h
index 0b088fdab7..0dbf2dd6a2 100644
--- a/lib/ethdev/ethdev_driver.h
+++ b/lib/ethdev/ethdev_driver.h
@@ -48,7 +48,7 @@ struct rte_eth_rxtx_callback {
  * memory. This split allows the function pointer and driver data to be per-
  * process, while the actual configuration data for the device is shared.
  */
-struct rte_eth_dev {
+struct __rte_cache_aligned rte_eth_dev {
 	eth_rx_burst_t rx_pkt_burst; /**< Pointer to PMD receive function */
 	eth_tx_burst_t tx_pkt_burst; /**< Pointer to PMD transmit function */
 
@@ -93,7 +93,7 @@ struct rte_eth_dev {
 
 	enum rte_eth_dev_state state; /**< Flag indicating the port state */
 	void *security_ctx; /**< Context for security ops */
-} __rte_cache_aligned;
+};
 
 struct rte_eth_dev_sriov;
 struct rte_eth_dev_owner;
@@ -104,7 +104,7 @@ struct rte_eth_dev_owner;
  * device. This structure is safe to place in shared memory to be common
  * among different processes in a multi-process configuration.
  */
-struct rte_eth_dev_data {
+struct __rte_cache_aligned rte_eth_dev_data {
 	char name[RTE_ETH_NAME_MAX_LEN]; /**< Unique identifier name */
 
 	void **rx_queues; /**< Array of pointers to Rx queues */
@@ -190,7 +190,7 @@ struct rte_eth_dev_data {
 	uint16_t backer_port_id;
 
 	pthread_mutex_t flow_ops_mutex; /**< rte_flow ops mutex */
-} __rte_cache_aligned;
+};
 
 /**
  * @internal
diff --git a/lib/ethdev/rte_ethdev.h b/lib/ethdev/rte_ethdev.h
index ed27360447..147257d6a2 100644
--- a/lib/ethdev/rte_ethdev.h
+++ b/lib/ethdev/rte_ethdev.h
@@ -333,12 +333,12 @@ struct rte_eth_stats {
  * A structure used to retrieve link-level information of an Ethernet port.
  */
 __extension__
-struct rte_eth_link {
+struct __rte_aligned(8) rte_eth_link { /**< aligned for atomic64 read/write */
 	uint32_t link_speed;        /**< RTE_ETH_SPEED_NUM_ */
 	uint16_t link_duplex  : 1;  /**< RTE_ETH_LINK_[HALF/FULL]_DUPLEX */
 	uint16_t link_autoneg : 1;  /**< RTE_ETH_LINK_[AUTONEG/FIXED] */
 	uint16_t link_status  : 1;  /**< RTE_ETH_LINK_[DOWN/UP] */
-} __rte_aligned(8);      /**< aligned for atomic64 read/write */
+};
 
 /**@{@name Link negotiation
  * Constants used in link management.
@@ -1836,7 +1836,7 @@ struct rte_eth_dev_info {
  * Ethernet device Rx queue information structure.
  * Used to retrieve information about configured queue.
  */
-struct rte_eth_rxq_info {
+struct __rte_cache_min_aligned rte_eth_rxq_info {
 	struct rte_mempool *mp;     /**< mempool used by that queue. */
 	struct rte_eth_rxconf conf; /**< queue config parameters. */
 	uint8_t scattered_rx;       /**< scattered packets Rx supported. */
@@ -1850,17 +1850,17 @@ struct rte_eth_rxq_info {
 	 * Value 0 means that the threshold monitoring is disabled.
 	 */
 	uint8_t avail_thresh;
-} __rte_cache_min_aligned;
+};
 
 /**
  * Ethernet device Tx queue information structure.
  * Used to retrieve information about configured queue.
  */
-struct rte_eth_txq_info {
+struct __rte_cache_min_aligned rte_eth_txq_info {
 	struct rte_eth_txconf conf; /**< queue config parameters. */
 	uint16_t nb_desc;           /**< configured number of TXDs. */
 	uint8_t queue_state;        /**< one of RTE_ETH_QUEUE_STATE_*. */
-} __rte_cache_min_aligned;
+};
 
 /**
  * @warning
@@ -1870,7 +1870,7 @@ struct rte_eth_txq_info {
  * Used to retrieve Rx queue information when Tx queue reusing mbufs and moving
  * them into Rx mbuf ring.
  */
-struct rte_eth_recycle_rxq_info {
+struct __rte_cache_min_aligned rte_eth_recycle_rxq_info {
 	struct rte_mbuf **mbuf_ring; /**< mbuf ring of Rx queue. */
 	struct rte_mempool *mp;     /**< mempool of Rx queue. */
 	uint16_t *refill_head;      /**< head of Rx queue refilling mbufs. */
@@ -1884,7 +1884,7 @@ struct rte_eth_recycle_rxq_info {
 	 * Value 0 means that PMD drivers have no requirement for this.
 	 */
 	uint16_t refill_requirement;
-} __rte_cache_min_aligned;
+};
 
 /* Generic Burst mode flag definition, values can be ORed. */
 
diff --git a/lib/ethdev/rte_ethdev_core.h b/lib/ethdev/rte_ethdev_core.h
index a18f242ca4..e55fb42996 100644
--- a/lib/ethdev/rte_ethdev_core.h
+++ b/lib/ethdev/rte_ethdev_core.h
@@ -84,7 +84,7 @@ struct rte_ethdev_qdata {
  * On 64-bit systems contents of this structure occupy exactly two 64B lines.
  * On 32-bit systems contents of this structure fits into one 64B line.
  */
-struct rte_eth_fp_ops {
+struct __rte_cache_aligned rte_eth_fp_ops {
 
 	/**@{*/
 	/**
@@ -124,7 +124,7 @@ struct rte_eth_fp_ops {
 	uintptr_t reserved2[1];
 	/**@}*/
 
-} __rte_cache_aligned;
+};
 
 extern struct rte_eth_fp_ops rte_eth_fp_ops[RTE_MAX_ETHPORTS];
 
diff --git a/lib/ethdev/rte_flow_driver.h b/lib/ethdev/rte_flow_driver.h
index 3c702e30b4..506d1262ab 100644
--- a/lib/ethdev/rte_flow_driver.h
+++ b/lib/ethdev/rte_flow_driver.h
@@ -432,7 +432,7 @@ typedef int (*rte_flow_async_action_list_handle_query_update_t)(
  *
  * Fast path async flow functions are held in a flat array, one entry per ethdev.
  */
-struct rte_flow_fp_ops {
+struct __rte_cache_aligned rte_flow_fp_ops {
 	rte_flow_async_create_t async_create;
 	rte_flow_async_create_by_index_t async_create_by_index;
 	rte_flow_async_actions_update_t async_actions_update;
@@ -447,7 +447,7 @@ struct rte_flow_fp_ops {
 	rte_flow_async_action_list_handle_create_t async_action_list_handle_create;
 	rte_flow_async_action_list_handle_destroy_t async_action_list_handle_destroy;
 	rte_flow_async_action_list_handle_query_update_t async_action_list_handle_query_update;
-} __rte_cache_aligned;
+};
 
 /**
  * @internal
diff --git a/lib/eventdev/event_timer_adapter_pmd.h b/lib/eventdev/event_timer_adapter_pmd.h
index 65b421b9f4..cd5127f047 100644
--- a/lib/eventdev/event_timer_adapter_pmd.h
+++ b/lib/eventdev/event_timer_adapter_pmd.h
@@ -86,7 +86,7 @@ struct event_timer_adapter_ops {
  * @internal Adapter data; structure to be placed in shared memory to be
  * accessible by various processes in a multi-process configuration.
  */
-struct rte_event_timer_adapter_data {
+struct __rte_cache_aligned rte_event_timer_adapter_data {
 	uint8_t id;
 	/**< Event timer adapter ID */
 	uint8_t event_dev_id;
@@ -110,7 +110,7 @@ struct rte_event_timer_adapter_data {
 
 	uint8_t started : 1;
 	/**< Flag to indicate adapter started. */
-} __rte_cache_aligned;
+};
 
 #ifdef __cplusplus
 }
diff --git a/lib/eventdev/eventdev_pmd.h b/lib/eventdev/eventdev_pmd.h
index 2ad880255e..7a5699f14b 100644
--- a/lib/eventdev/eventdev_pmd.h
+++ b/lib/eventdev/eventdev_pmd.h
@@ -105,7 +105,7 @@ struct rte_eventdev_global {
  * This structure is safe to place in shared memory to be common among
  * different processes in a multi-process configuration.
  */
-struct rte_eventdev_data {
+struct __rte_cache_aligned rte_eventdev_data {
 	int socket_id;
 	/**< Socket ID where memory is allocated */
 	uint8_t dev_id;
@@ -144,10 +144,10 @@ struct rte_eventdev_data {
 
 	uint64_t reserved_64s[4]; /**< Reserved for future fields */
 	void *reserved_ptrs[4];	  /**< Reserved for future fields */
-} __rte_cache_aligned;
+};
 
 /** @internal The data structure associated with each event device. */
-struct rte_eventdev {
+struct __rte_cache_aligned rte_eventdev {
 	struct rte_eventdev_data *data;
 	/**< Pointer to device data */
 	struct eventdev_ops *dev_ops;
@@ -187,7 +187,7 @@ struct rte_eventdev {
 
 	uint64_t reserved_64s[3]; /**< Reserved for future fields */
 	void *reserved_ptrs[3];	  /**< Reserved for future fields */
-} __rte_cache_aligned;
+};
 
 extern struct rte_eventdev *rte_eventdevs;
 /** @internal The pool of rte_eventdev structures. */
diff --git a/lib/eventdev/rte_event_crypto_adapter.c b/lib/eventdev/rte_event_crypto_adapter.c
index 9903f96695..db1c7f3906 100644
--- a/lib/eventdev/rte_event_crypto_adapter.c
+++ b/lib/eventdev/rte_event_crypto_adapter.c
@@ -42,7 +42,7 @@
 
 #define ECA_ADAPTER_ARRAY "crypto_adapter_array"
 
-struct crypto_ops_circular_buffer {
+struct __rte_cache_aligned crypto_ops_circular_buffer {
 	/* index of head element in circular buffer */
 	uint16_t head;
 	/* index of tail element in circular buffer */
@@ -53,9 +53,9 @@ struct crypto_ops_circular_buffer {
 	uint16_t size;
 	/* Pointer to hold rte_crypto_ops for batching */
 	struct rte_crypto_op **op_buffer;
-} __rte_cache_aligned;
+};
 
-struct event_crypto_adapter {
+struct __rte_cache_aligned event_crypto_adapter {
 	/* Event device identifier */
 	uint8_t eventdev_id;
 	/* Event port identifier */
@@ -98,10 +98,10 @@ struct event_crypto_adapter {
 	uint16_t nb_qps;
 	/* Adapter mode */
 	enum rte_event_crypto_adapter_mode mode;
-} __rte_cache_aligned;
+};
 
 /* Per crypto device information */
-struct crypto_device_info {
+struct __rte_cache_aligned crypto_device_info {
 	/* Pointer to cryptodev */
 	struct rte_cryptodev *dev;
 	/* Pointer to queue pair info */
@@ -118,15 +118,15 @@ struct crypto_device_info {
 	 * be invoked if not already invoked
 	 */
 	uint16_t num_qpairs;
-} __rte_cache_aligned;
+};
 
 /* Per queue pair information */
-struct crypto_queue_pair_info {
+struct __rte_cache_aligned crypto_queue_pair_info {
 	/* Set to indicate queue pair is enabled */
 	bool qp_enabled;
 	/* Circular buffer for batching crypto ops to cdev */
 	struct crypto_ops_circular_buffer cbuf;
-} __rte_cache_aligned;
+};
 
 static struct event_crypto_adapter **event_crypto_adapter;
 
diff --git a/lib/eventdev/rte_event_dma_adapter.c b/lib/eventdev/rte_event_dma_adapter.c
index 4196164305..24dff556db 100644
--- a/lib/eventdev/rte_event_dma_adapter.c
+++ b/lib/eventdev/rte_event_dma_adapter.c
@@ -26,7 +26,7 @@
 	} while (0)
 
 /* DMA ops circular buffer */
-struct dma_ops_circular_buffer {
+struct __rte_cache_aligned dma_ops_circular_buffer {
 	/* Index of head element */
 	uint16_t head;
 
@@ -41,19 +41,19 @@ struct dma_ops_circular_buffer {
 
 	/* Pointer to hold rte_event_dma_adapter_op for processing */
 	struct rte_event_dma_adapter_op **op_buffer;
-} __rte_cache_aligned;
+};
 
 /* Vchan information */
-struct dma_vchan_info {
+struct __rte_cache_aligned dma_vchan_info {
 	/* Set to indicate vchan queue is enabled */
 	bool vq_enabled;
 
 	/* Circular buffer for batching DMA ops to dma_dev */
 	struct dma_ops_circular_buffer dma_buf;
-} __rte_cache_aligned;
+};
 
 /* DMA device information */
-struct dma_device_info {
+struct __rte_cache_aligned dma_device_info {
 	/* Pointer to vchan queue info */
 	struct dma_vchan_info *vchanq;
 
@@ -81,9 +81,9 @@ struct dma_device_info {
 	 * transfer uses a hardware mechanism
 	 */
 	uint8_t internal_event_port;
-} __rte_cache_aligned;
+};
 
-struct event_dma_adapter {
+struct __rte_cache_aligned event_dma_adapter {
 	/* Event device identifier */
 	uint8_t eventdev_id;
 
@@ -145,7 +145,7 @@ struct event_dma_adapter {
 
 	/* Per instance stats structure */
 	struct rte_event_dma_adapter_stats dma_stats;
-} __rte_cache_aligned;
+};
 
 static struct event_dma_adapter **event_dma_adapter;
 
diff --git a/lib/eventdev/rte_event_eth_rx_adapter.c b/lib/eventdev/rte_event_eth_rx_adapter.c
index 1b83a55b5c..3ee20d95f3 100644
--- a/lib/eventdev/rte_event_eth_rx_adapter.c
+++ b/lib/eventdev/rte_event_eth_rx_adapter.c
@@ -72,7 +72,7 @@ struct eth_rx_poll_entry {
 	uint16_t eth_rx_qid;
 };
 
-struct eth_rx_vector_data {
+struct __rte_cache_aligned eth_rx_vector_data {
 	TAILQ_ENTRY(eth_rx_vector_data) next;
 	uint16_t port;
 	uint16_t queue;
@@ -82,7 +82,7 @@ struct eth_rx_vector_data {
 	uint64_t vector_timeout_ticks;
 	struct rte_mempool *vector_pool;
 	struct rte_event_vector *vector_ev;
-} __rte_cache_aligned;
+};
 
 TAILQ_HEAD(eth_rx_vector_data_list, eth_rx_vector_data);
 
@@ -103,7 +103,7 @@ struct eth_event_enqueue_buffer {
 	uint16_t last_mask;
 };
 
-struct event_eth_rx_adapter {
+struct __rte_cache_aligned event_eth_rx_adapter {
 	/* RSS key */
 	uint8_t rss_key_be[RSS_KEY_SIZE];
 	/* Event device identifier */
@@ -188,7 +188,7 @@ struct event_eth_rx_adapter {
 	uint8_t rxa_started;
 	/* Adapter ID */
 	uint8_t id;
-} __rte_cache_aligned;
+};
 
 /* Per eth device */
 struct eth_device_info {
diff --git a/lib/eventdev/rte_event_eth_tx_adapter.c b/lib/eventdev/rte_event_eth_tx_adapter.c
index 56435be991..67fff8b7d6 100644
--- a/lib/eventdev/rte_event_eth_tx_adapter.c
+++ b/lib/eventdev/rte_event_eth_tx_adapter.c
@@ -109,7 +109,7 @@ struct txa_service_queue_info {
 };
 
 /* PMD private structure */
-struct txa_service_data {
+struct __rte_cache_aligned txa_service_data {
 	/* Max mbufs processed in any service function invocation */
 	uint32_t max_nb_tx;
 	/* Number of Tx queues in adapter */
@@ -144,7 +144,7 @@ struct txa_service_data {
 	int64_t service_id;
 	/* Memory allocation name */
 	char mem_name[TXA_MEM_NAME_LEN];
-} __rte_cache_aligned;
+};
 
 /* Per eth device structure */
 struct txa_service_ethdev {
diff --git a/lib/eventdev/rte_event_timer_adapter.c b/lib/eventdev/rte_event_timer_adapter.c
index fe605e5f46..5886965d14 100644
--- a/lib/eventdev/rte_event_timer_adapter.c
+++ b/lib/eventdev/rte_event_timer_adapter.c
@@ -512,11 +512,11 @@ rte_event_timer_remaining_ticks_get(
 
 #define EXP_TIM_BUF_SZ 128
 
-struct event_buffer {
+struct __rte_cache_aligned event_buffer {
 	size_t head;
 	size_t tail;
 	struct rte_event events[EVENT_BUFFER_SZ];
-} __rte_cache_aligned;
+};
 
 static inline bool
 event_buffer_full(struct event_buffer *bufp)
diff --git a/lib/eventdev/rte_event_timer_adapter.h b/lib/eventdev/rte_event_timer_adapter.h
index c133dec760..0bd1b30045 100644
--- a/lib/eventdev/rte_event_timer_adapter.h
+++ b/lib/eventdev/rte_event_timer_adapter.h
@@ -473,7 +473,7 @@ enum rte_event_timer_state {
  * The generic *rte_event_timer* structure to hold the event timer attributes
  * for arm and cancel operations.
  */
-struct rte_event_timer {
+struct __rte_cache_aligned rte_event_timer {
 	struct rte_event ev;
 	/**<
 	 * Expiry event attributes.  On successful event timer timeout,
@@ -504,7 +504,7 @@ struct rte_event_timer {
 	/**< Memory to store user specific metadata.
 	 * The event timer adapter implementation should not modify this area.
 	 */
-} __rte_cache_aligned;
+};
 
 typedef uint16_t (*rte_event_timer_arm_burst_t)(
 		const struct rte_event_timer_adapter *adapter,
@@ -526,7 +526,7 @@ typedef uint16_t (*rte_event_timer_cancel_burst_t)(
 /**
  * @internal Data structure associated with each event timer adapter.
  */
-struct rte_event_timer_adapter {
+struct __rte_cache_aligned rte_event_timer_adapter {
 	rte_event_timer_arm_burst_t arm_burst;
 	/**< Pointer to driver arm_burst function. */
 	rte_event_timer_arm_tmo_tick_burst_t arm_tmo_tick_burst;
@@ -540,7 +540,7 @@ struct rte_event_timer_adapter {
 
 	uint8_t allocated : 1;
 	/**< Flag to indicate that this adapter has been allocated */
-} __rte_cache_aligned;
+};
 
 #define ADAPTER_VALID_OR_ERR_RET(adapter, retval) do {		\
 	if (adapter == NULL || !adapter->allocated)		\
diff --git a/lib/eventdev/rte_eventdev.h b/lib/eventdev/rte_eventdev.h
index 3af46864df..08e5f9320b 100644
--- a/lib/eventdev/rte_eventdev.h
+++ b/lib/eventdev/rte_eventdev.h
@@ -1338,7 +1338,7 @@ rte_event_dev_close(uint8_t dev_id);
 /**
  * Event vector structure.
  */
-struct rte_event_vector {
+struct __rte_aligned(16) rte_event_vector {
 	uint16_t nb_elem;
 	/**< Number of elements valid in this event vector. */
 	uint16_t elem_offset : 12;
@@ -1376,23 +1376,19 @@ struct rte_event_vector {
 	 * value to share between dequeue and enqueue operation.
 	 * The application should not modify this field.
 	 */
-	union {
+	union __rte_aligned(16) {
 #endif
 		struct rte_mbuf *mbufs[0];
 		void *ptrs[0];
 		uint64_t u64s[0];
 #ifndef __cplusplus
-	} __rte_aligned(16);
+	};
 #endif
 	/**< Start of the vector array union. Depending upon the event type the
 	 * vector array can be an array of mbufs or pointers or opaque u64
 	 * values.
 	 */
-#ifndef __DOXYGEN__
-} __rte_aligned(16);
-#else
 };
-#endif
 
 /* Scheduler type definitions */
 #define RTE_SCHED_TYPE_ORDERED          0
diff --git a/lib/eventdev/rte_eventdev_core.h b/lib/eventdev/rte_eventdev_core.h
index 5b405518d1..fc8e1556ab 100644
--- a/lib/eventdev/rte_eventdev_core.h
+++ b/lib/eventdev/rte_eventdev_core.h
@@ -49,7 +49,7 @@ typedef uint16_t (*event_dma_adapter_enqueue_t)(void *port, struct rte_event ev[
 typedef int (*event_profile_switch_t)(void *port, uint8_t profile);
 /**< @internal Switch active link profile on the event port. */
 
-struct rte_event_fp_ops {
+struct __rte_cache_aligned rte_event_fp_ops {
 	void **data;
 	/**< points to array of internal port data pointers */
 	event_enqueue_t enqueue;
@@ -77,7 +77,7 @@ struct rte_event_fp_ops {
 	event_profile_switch_t profile_switch;
 	/**< PMD Event switch profile function. */
 	uintptr_t reserved[4];
-} __rte_cache_aligned;
+};
 
 extern struct rte_event_fp_ops rte_event_fp_ops[RTE_EVENT_MAX_DEVS];
 
diff --git a/lib/gpudev/gpudev_driver.h b/lib/gpudev/gpudev_driver.h
index 0b1e7f2172..37b6ae3149 100644
--- a/lib/gpudev/gpudev_driver.h
+++ b/lib/gpudev/gpudev_driver.h
@@ -72,7 +72,7 @@ struct rte_gpu_mpshared {
 	RTE_ATOMIC(uint16_t) process_refcnt; /* Updated by this library. */
 };
 
-struct rte_gpu {
+struct __rte_cache_aligned rte_gpu {
 	/* Backing device. */
 	struct rte_device *device;
 	/* Data shared between processes. */
@@ -85,7 +85,7 @@ struct rte_gpu {
 	enum rte_gpu_state process_state; /* Updated by this library. */
 	/* Driver-specific private data for the running process. */
 	void *process_private;
-} __rte_cache_aligned;
+};
 
 __rte_internal
 struct rte_gpu *rte_gpu_get_by_name(const char *name);
diff --git a/lib/graph/graph_private.h b/lib/graph/graph_private.h
index 04538eab86..d557d55f2d 100644
--- a/lib/graph/graph_private.h
+++ b/lib/graph/graph_private.h
@@ -70,11 +70,11 @@ struct node {
  * Structure that holds the graph scheduling workqueue node stream.
  * Used for mcore dispatch model.
  */
-struct graph_mcore_dispatch_wq_node {
+struct __rte_cache_aligned graph_mcore_dispatch_wq_node {
 	rte_graph_off_t node_off;
 	uint16_t nb_objs;
 	void *objs[RTE_GRAPH_BURST_SIZE];
-} __rte_cache_aligned;
+};
 
 /**
  * @internal
diff --git a/lib/graph/graph_stats.c b/lib/graph/graph_stats.c
index cc32245c05..2fb808b21e 100644
--- a/lib/graph/graph_stats.c
+++ b/lib/graph/graph_stats.c
@@ -28,7 +28,7 @@ struct cluster_node {
 	struct rte_node *nodes[];
 };
 
-struct rte_graph_cluster_stats {
+struct __rte_cache_aligned rte_graph_cluster_stats {
 	/* Header */
 	rte_graph_cluster_stats_cb_t fn;
 	uint32_t cluster_node_size; /* Size of struct cluster_node */
@@ -38,7 +38,7 @@ struct rte_graph_cluster_stats {
 	size_t sz;
 
 	struct cluster_node clusters[];
-} __rte_cache_aligned;
+};
 
 #define boarder_model_dispatch()                                                              \
 	fprintf(f, "+-------------------------------+---------------+--------" \
diff --git a/lib/graph/rte_graph.h b/lib/graph/rte_graph.h
index 2d37d5e0d6..ecfec2068a 100644
--- a/lib/graph/rte_graph.h
+++ b/lib/graph/rte_graph.h
@@ -200,7 +200,7 @@ struct rte_graph_cluster_stats_param {
  *
  * @see struct rte_graph_cluster_stats_param::fn
  */
-struct rte_graph_cluster_node_stats {
+struct __rte_cache_aligned rte_graph_cluster_node_stats {
 	uint64_t ts;	    /**< Current timestamp. */
 	uint64_t calls;	    /**< Current number of calls made. */
 	uint64_t objs;      /**< Current number of objs processed. */
@@ -225,7 +225,7 @@ struct rte_graph_cluster_node_stats {
 	rte_node_t id;	/**< Node identifier of stats. */
 	uint64_t hz;	/**< Cycles per seconds. */
 	char name[RTE_NODE_NAMESIZE];	/**< Name of the node. */
-} __rte_cache_aligned;
+};
 
 /**
  * Create Graph.
diff --git a/lib/graph/rte_graph_worker_common.h b/lib/graph/rte_graph_worker_common.h
index 4efc461f8a..36d864e2c1 100644
--- a/lib/graph/rte_graph_worker_common.h
+++ b/lib/graph/rte_graph_worker_common.h
@@ -45,7 +45,7 @@ SLIST_HEAD(rte_graph_rq_head, rte_graph);
  *
  * Data structure to hold graph data.
  */
-struct rte_graph {
+struct __rte_cache_aligned rte_graph {
 	/* Fast path area. */
 	uint32_t tail;		     /**< Tail of circular buffer. */
 	uint32_t head;		     /**< Head of circular buffer. */
@@ -80,14 +80,14 @@ struct rte_graph {
 	uint64_t nb_pkt_to_capture;
 	char pcap_filename[RTE_GRAPH_PCAP_FILE_SZ];  /**< Pcap filename. */
 	uint64_t fence;			/**< Fence. */
-} __rte_cache_aligned;
+};
 
 /**
  * @internal
  *
  * Data structure to hold node data.
  */
-struct rte_node {
+struct __rte_cache_aligned rte_node {
 	/* Slow path area  */
 	uint64_t fence;		/**< Fence. */
 	rte_graph_off_t next;	/**< Index to next node. */
@@ -128,7 +128,7 @@ struct rte_node {
 			uint64_t process_u64;
 		};
 	alignas(RTE_CACHE_LINE_MIN_SIZE) struct rte_node *nodes[]; /**< Next nodes. */
-} __rte_cache_aligned;
+};
 
 /**
  * @internal
diff --git a/lib/hash/rte_cuckoo_hash.h b/lib/hash/rte_cuckoo_hash.h
index b2ccc5f669..a528f1d1a0 100644
--- a/lib/hash/rte_cuckoo_hash.h
+++ b/lib/hash/rte_cuckoo_hash.h
@@ -119,10 +119,10 @@ const rte_hash_cmp_eq_t cmp_jump_table[NUM_KEY_CMP_CASES] = {
 
 #define RTE_HASH_TSX_MAX_RETRY  10
 
-struct lcore_cache {
+struct __rte_cache_aligned lcore_cache {
 	unsigned len; /**< Cache len */
 	uint32_t objs[LCORE_CACHE_SIZE]; /**< Cache objects */
-} __rte_cache_aligned;
+};
 
 /* Structure that stores key-value pair */
 struct rte_hash_key {
@@ -143,7 +143,7 @@ enum rte_hash_sig_compare_function {
 };
 
 /** Bucket structure */
-struct rte_hash_bucket {
+struct __rte_cache_aligned rte_hash_bucket {
 	uint16_t sig_current[RTE_HASH_BUCKET_ENTRIES];
 
 	RTE_ATOMIC(uint32_t) key_idx[RTE_HASH_BUCKET_ENTRIES];
@@ -151,10 +151,10 @@ struct rte_hash_bucket {
 	uint8_t flag[RTE_HASH_BUCKET_ENTRIES];
 
 	void *next;
-} __rte_cache_aligned;
+};
 
 /** A hash table structure. */
-struct rte_hash {
+struct __rte_cache_aligned rte_hash {
 	char name[RTE_HASH_NAMESIZE];   /**< Name of the hash. */
 	uint32_t entries;               /**< Total table entries. */
 	uint32_t num_buckets;           /**< Number of buckets in table. */
@@ -222,7 +222,7 @@ struct rte_hash {
 	uint32_t *ext_bkt_to_free;
 	RTE_ATOMIC(uint32_t) *tbl_chng_cnt;
 	/**< Indicates if the hash table changed from last read. */
-} __rte_cache_aligned;
+};
 
 struct queue_node {
 	struct rte_hash_bucket *bkt; /* Current bucket on the bfs search */
diff --git a/lib/hash/rte_thash.h b/lib/hash/rte_thash.h
index 2681b1b5b3..30b657e67a 100644
--- a/lib/hash/rte_thash.h
+++ b/lib/hash/rte_thash.h
@@ -99,14 +99,14 @@ struct rte_ipv6_tuple {
 	};
 };
 
+#ifdef RTE_ARCH_X86
+union __rte_aligned(XMM_SIZE) rte_thash_tuple {
+#else
 union rte_thash_tuple {
+#endif
 	struct rte_ipv4_tuple	v4;
 	struct rte_ipv6_tuple	v6;
-#ifdef RTE_ARCH_X86
-} __rte_aligned(XMM_SIZE);
-#else
 };
-#endif
 
 /**
  * Prepare special converted key to use with rte_softrss_be()
diff --git a/lib/ip_frag/ip_reassembly.h b/lib/ip_frag/ip_reassembly.h
index a9f97aefca..5443c738a6 100644
--- a/lib/ip_frag/ip_reassembly.h
+++ b/lib/ip_frag/ip_reassembly.h
@@ -47,7 +47,7 @@ struct ip_frag_key {
  * Fragmented packet to reassemble.
  * First two entries in the frags[] array are for the last and first fragments.
  */
-struct ip_frag_pkt {
+struct __rte_cache_aligned ip_frag_pkt {
 	RTE_TAILQ_ENTRY(ip_frag_pkt) lru;      /* LRU list */
 	struct ip_frag_key key;                /* fragmentation key */
 	uint64_t start;                        /* creation timestamp */
@@ -55,20 +55,20 @@ struct ip_frag_pkt {
 	uint32_t frag_size;                    /* size of fragments received */
 	uint32_t last_idx;                     /* index of next entry to fill */
 	struct ip_frag frags[IP_MAX_FRAG_NUM]; /* fragments */
-} __rte_cache_aligned;
+};
 
  /* fragments tailq */
 RTE_TAILQ_HEAD(ip_pkt_list, ip_frag_pkt);
 
 /* fragmentation table statistics */
-struct ip_frag_tbl_stat {
+struct __rte_cache_aligned ip_frag_tbl_stat {
 	uint64_t find_num;     /* total # of find/insert attempts. */
 	uint64_t add_num;      /* # of add ops. */
 	uint64_t del_num;      /* # of del ops. */
 	uint64_t reuse_num;    /* # of reuse (del/add) ops. */
 	uint64_t fail_total;   /* total # of add failures. */
 	uint64_t fail_nospace; /* # of 'no space' add failures. */
-} __rte_cache_aligned;
+};
 
 /* fragmentation table */
 struct rte_ip_frag_tbl {
diff --git a/lib/ipsec/rte_ipsec.h b/lib/ipsec/rte_ipsec.h
index 44cecabeb8..f15f6f2966 100644
--- a/lib/ipsec/rte_ipsec.h
+++ b/lib/ipsec/rte_ipsec.h
@@ -55,7 +55,7 @@ struct rte_ipsec_sa_pkt_func {
  * - pointer to security/crypto session, plus other related data
  * - session/device specific functions to prepare/process IPsec packets.
  */
-struct rte_ipsec_session {
+struct __rte_cache_aligned rte_ipsec_session {
 	/**
 	 * SA that session belongs to.
 	 * Note that multiple sessions can belong to the same SA.
@@ -77,7 +77,7 @@ struct rte_ipsec_session {
 	};
 	/** functions to prepare/process IPsec packets */
 	struct rte_ipsec_sa_pkt_func pkt_func;
-} __rte_cache_aligned;
+};
 
 /**
  * Checks that inside given rte_ipsec_session crypto/security fields
diff --git a/lib/ipsec/sa.h b/lib/ipsec/sa.h
index 4b30bea593..2560d33d84 100644
--- a/lib/ipsec/sa.h
+++ b/lib/ipsec/sa.h
@@ -75,7 +75,7 @@ enum sa_algo_type	{
 	ALGO_TYPE_MAX
 };
 
-struct rte_ipsec_sa {
+struct __rte_cache_aligned rte_ipsec_sa {
 
 	uint64_t type;     /* type of given SA */
 	uint64_t udata;    /* user defined */
@@ -141,7 +141,7 @@ struct rte_ipsec_sa {
 		} errors;
 	} statistics;
 
-} __rte_cache_aligned;
+};
 
 int
 ipsec_sa_pkt_func_select(const struct rte_ipsec_session *ss,
diff --git a/lib/jobstats/rte_jobstats.h b/lib/jobstats/rte_jobstats.h
index 45b460e603..bdd85fe000 100644
--- a/lib/jobstats/rte_jobstats.h
+++ b/lib/jobstats/rte_jobstats.h
@@ -32,7 +32,7 @@ struct rte_jobstats;
 typedef void (*rte_job_update_period_cb_t)(struct rte_jobstats *job,
 		int64_t job_result);
 
-struct rte_jobstats {
+struct __rte_cache_aligned rte_jobstats {
 	uint64_t period;
 	/**< Estimated period of execution. */
 
@@ -65,9 +65,9 @@ struct rte_jobstats {
 
 	struct rte_jobstats_context *context;
 	/**< Job stats context object that is executing this job. */
-} __rte_cache_aligned;
+};
 
-struct rte_jobstats_context {
+struct __rte_cache_aligned rte_jobstats_context {
 	/** Variable holding time at different points:
 	 * -# loop start time if loop was started but no job executed yet.
 	 * -# job start time if job is currently executing.
@@ -111,7 +111,7 @@ struct rte_jobstats_context {
 
 	uint64_t loop_cnt;
 	/**< Total count of executed loops with at least one executed job. */
-} __rte_cache_aligned;
+};
 
 /**
  * Initialize given context object with default values.
diff --git a/lib/mbuf/rte_mbuf_core.h b/lib/mbuf/rte_mbuf_core.h
index fb6d5b5a1e..9f580769cf 100644
--- a/lib/mbuf/rte_mbuf_core.h
+++ b/lib/mbuf/rte_mbuf_core.h
@@ -464,7 +464,7 @@ enum {
 /**
  * The generic rte_mbuf, containing a packet mbuf.
  */
-struct rte_mbuf {
+struct __rte_cache_aligned rte_mbuf {
 	RTE_MARKER cacheline0;
 
 	void *buf_addr;           /**< Virtual address of segment buffer. */
@@ -663,7 +663,7 @@ struct rte_mbuf {
 	uint16_t timesync;
 
 	uint32_t dynfield1[9]; /**< Reserved for dynamic fields. */
-} __rte_cache_aligned;
+};
 
 /**
  * Function typedef of callback to free externally attached buffer.
diff --git a/lib/member/rte_member.h b/lib/member/rte_member.h
index 3278bbb5c1..aec192eba5 100644
--- a/lib/member/rte_member.h
+++ b/lib/member/rte_member.h
@@ -139,7 +139,7 @@ typedef void (*sketch_delete_fn_t)(const struct rte_member_setsum *ss,
 				   const void *key);
 
 /** @internal setsummary structure. */
-struct rte_member_setsum {
+struct __rte_cache_aligned rte_member_setsum {
 	enum rte_member_setsum_type type; /* Type of the set summary. */
 	uint32_t key_len;		/* Length of key. */
 	uint32_t prim_hash_seed;	/* Primary hash function seed. */
@@ -185,14 +185,14 @@ struct rte_member_setsum {
 #ifdef RTE_ARCH_X86
 	bool use_avx512;
 #endif
-} __rte_cache_aligned;
+};
 
 /**
  * Parameters used when create the set summary table. Currently user can
  * specify two types of setsummary: HT based and vBF. For HT based, user can
  * specify cache or non-cache mode. Here is a table to describe some differences
  */
-struct rte_member_parameters {
+struct __rte_cache_aligned rte_member_parameters {
 	const char *name;			/**< Name of the hash. */
 
 	/**
@@ -326,7 +326,7 @@ struct rte_member_parameters {
 	uint32_t extra_flag;
 
 	int socket_id;			/**< NUMA Socket ID for memory. */
-} __rte_cache_aligned;
+};
 
 /**
  * Find an existing set-summary and return a pointer to it.
diff --git a/lib/member/rte_member_ht.h b/lib/member/rte_member_ht.h
index 9e24ccdc24..c9673e3356 100644
--- a/lib/member/rte_member_ht.h
+++ b/lib/member/rte_member_ht.h
@@ -15,10 +15,10 @@ extern "C" {
 typedef uint16_t member_sig_t;			/* signature size is 16 bit */
 
 /* The bucket struct for ht setsum */
-struct member_ht_bucket {
+struct __rte_cache_aligned member_ht_bucket {
 	member_sig_t sigs[RTE_MEMBER_BUCKET_ENTRIES];	/* 2-byte signature */
 	member_set_t sets[RTE_MEMBER_BUCKET_ENTRIES];	/* 2-byte set */
-} __rte_cache_aligned;
+};
 
 int
 rte_member_create_ht(struct rte_member_setsum *ss,
diff --git a/lib/member/rte_member_sketch.c b/lib/member/rte_member_sketch.c
index e006e835d9..15af6786e9 100644
--- a/lib/member/rte_member_sketch.c
+++ b/lib/member/rte_member_sketch.c
@@ -23,7 +23,7 @@
 #include "rte_member_sketch_avx512.h"
 #endif /* CC_AVX512_SUPPORT */
 
-struct sketch_runtime {
+struct __rte_cache_aligned sketch_runtime {
 	uint64_t pkt_cnt;
 	uint32_t until_next;
 	int converged;
@@ -31,7 +31,7 @@ struct sketch_runtime {
 	struct node *report_array;
 	void *key_slots;
 	struct rte_ring *free_key_slots;
-} __rte_cache_aligned;
+};
 
 /*
  * Geometric sampling to calculate how many packets needs to be
diff --git a/lib/mempool/rte_mempool.h b/lib/mempool/rte_mempool.h
index 0615b85e33..23fd5c8465 100644
--- a/lib/mempool/rte_mempool.h
+++ b/lib/mempool/rte_mempool.h
@@ -67,7 +67,7 @@ extern "C" {
  * captured since they can be calculated from other stats.
  * For example: put_cache_objs = put_objs - put_common_pool_objs.
  */
-struct rte_mempool_debug_stats {
+struct __rte_cache_aligned rte_mempool_debug_stats {
 	uint64_t put_bulk;             /**< Number of puts. */
 	uint64_t put_objs;             /**< Number of objects successfully put. */
 	uint64_t put_common_pool_bulk; /**< Number of bulks enqueued in common pool. */
@@ -81,13 +81,13 @@ struct rte_mempool_debug_stats {
 	uint64_t get_success_blks;     /**< Successful allocation number of contiguous blocks. */
 	uint64_t get_fail_blks;        /**< Failed allocation number of contiguous blocks. */
 	RTE_CACHE_GUARD;
-} __rte_cache_aligned;
+};
 #endif
 
 /**
  * A structure that stores a per-core object cache.
  */
-struct rte_mempool_cache {
+struct __rte_cache_aligned rte_mempool_cache {
 	uint32_t size;	      /**< Size of the cache */
 	uint32_t flushthresh; /**< Threshold before we flush excess elements */
 	uint32_t len;	      /**< Current cache count */
@@ -111,7 +111,7 @@ struct rte_mempool_cache {
 	 * cases to avoid needless emptying of cache.
 	 */
 	alignas(RTE_CACHE_LINE_SIZE) void *objs[RTE_MEMPOOL_CACHE_MAX_SIZE * 2];
-} __rte_cache_aligned;
+};
 
 /**
  * A structure that stores the size of mempool elements.
@@ -219,15 +219,15 @@ struct rte_mempool_memhdr {
  * The structure is cache-line aligned to avoid ABI breakages in
  * a number of cases when something small is added.
  */
-struct rte_mempool_info {
+struct __rte_cache_aligned rte_mempool_info {
 	/** Number of objects in the contiguous block */
 	unsigned int contig_block_size;
-} __rte_cache_aligned;
+};
 
 /**
  * The RTE mempool structure.
  */
-struct rte_mempool {
+struct __rte_cache_aligned rte_mempool {
 	char name[RTE_MEMPOOL_NAMESIZE]; /**< Name of mempool. */
 	union {
 		void *pool_data;         /**< Ring or pool to store objects. */
@@ -269,7 +269,7 @@ struct rte_mempool {
 	 */
 	struct rte_mempool_debug_stats stats[RTE_MAX_LCORE + 1];
 #endif
-}  __rte_cache_aligned;
+};
 
 /** Spreading among memory channels not required. */
 #define RTE_MEMPOOL_F_NO_SPREAD		0x0001
@@ -689,7 +689,7 @@ typedef int (*rte_mempool_get_info_t)(const struct rte_mempool *mp,
 
 
 /** Structure defining mempool operations structure */
-struct rte_mempool_ops {
+struct __rte_cache_aligned rte_mempool_ops {
 	char name[RTE_MEMPOOL_OPS_NAMESIZE]; /**< Name of mempool ops struct. */
 	rte_mempool_alloc_t alloc;       /**< Allocate private data. */
 	rte_mempool_free_t free;         /**< Free the external pool. */
@@ -714,7 +714,7 @@ struct rte_mempool_ops {
 	 * Dequeue a number of contiguous object blocks.
 	 */
 	rte_mempool_dequeue_contig_blocks_t dequeue_contig_blocks;
-} __rte_cache_aligned;
+};
 
 #define RTE_MEMPOOL_MAX_OPS_IDX 16  /**< Max registered ops structs */
 
@@ -727,14 +727,14 @@ struct rte_mempool_ops {
  * any function pointers stored directly in the mempool struct would not be.
  * This results in us simply having "ops_index" in the mempool struct.
  */
-struct rte_mempool_ops_table {
+struct __rte_cache_aligned rte_mempool_ops_table {
 	rte_spinlock_t sl;     /**< Spinlock for add/delete. */
 	uint32_t num_ops;      /**< Number of used ops structs in the table. */
 	/**
 	 * Storage for all possible ops structs.
 	 */
 	struct rte_mempool_ops ops[RTE_MEMPOOL_MAX_OPS_IDX];
-} __rte_cache_aligned;
+};
 
 /** Array of registered ops structs. */
 extern struct rte_mempool_ops_table rte_mempool_ops_table;
diff --git a/lib/mldev/rte_mldev.h b/lib/mldev/rte_mldev.h
index 01577bd8bd..634af3d5e1 100644
--- a/lib/mldev/rte_mldev.h
+++ b/lib/mldev/rte_mldev.h
@@ -421,7 +421,7 @@ struct rte_ml_buff_seg {
  * This structure contains data related to performing an ML operation on the buffers using
  * the model specified through model_id.
  */
-struct rte_ml_op {
+struct __rte_cache_aligned rte_ml_op {
 	uint16_t model_id;
 	/**< Model ID to be used for the operation. */
 	uint16_t nb_batches;
@@ -469,7 +469,7 @@ struct rte_ml_op {
 	 * dequeue and enqueue operation.
 	 * The application should not modify this field.
 	 */
-} __rte_cache_aligned;
+};
 
 /* Enqueue/Dequeue operations */
 
diff --git a/lib/mldev/rte_mldev_core.h b/lib/mldev/rte_mldev_core.h
index 2279b1dcec..b3bd281083 100644
--- a/lib/mldev/rte_mldev_core.h
+++ b/lib/mldev/rte_mldev_core.h
@@ -626,7 +626,7 @@ struct rte_ml_dev_data {
  *
  * The data structure associated with each ML device.
  */
-struct rte_ml_dev {
+struct __rte_cache_aligned rte_ml_dev {
 	/** Pointer to PMD enqueue function. */
 	mldev_enqueue_t enqueue_burst;
 
@@ -647,7 +647,7 @@ struct rte_ml_dev {
 
 	/** Flag indicating the device is attached. */
 	__extension__ uint8_t attached : 1;
-} __rte_cache_aligned;
+};
 
 /**
  * @internal
diff --git a/lib/net/rte_arp.h b/lib/net/rte_arp.h
index c3cd0afb5c..668cea1704 100644
--- a/lib/net/rte_arp.h
+++ b/lib/net/rte_arp.h
@@ -21,17 +21,17 @@ extern "C" {
 /**
  * ARP header IPv4 payload.
  */
-struct rte_arp_ipv4 {
+struct __rte_aligned(2) rte_arp_ipv4 {
 	struct rte_ether_addr arp_sha;  /**< sender hardware address */
 	rte_be32_t            arp_sip;  /**< sender IP address */
 	struct rte_ether_addr arp_tha;  /**< target hardware address */
 	rte_be32_t            arp_tip;  /**< target IP address */
-} __rte_packed __rte_aligned(2);
+} __rte_packed;
 
 /**
  * ARP header.
  */
-struct rte_arp_hdr {
+struct __rte_aligned(2) rte_arp_hdr {
 	rte_be16_t arp_hardware; /**< format of hardware address */
 #define RTE_ARP_HRD_ETHER     1  /**< ARP Ethernet address format */
 
@@ -47,7 +47,7 @@ struct rte_arp_hdr {
 #define	RTE_ARP_OP_INVREPLY   9  /**< response identifying peer */
 
 	struct rte_arp_ipv4 arp_data;
-} __rte_packed __rte_aligned(2);
+} __rte_packed;
 
 /**
  * Make a RARP packet based on MAC addr.
diff --git a/lib/net/rte_ether.h b/lib/net/rte_ether.h
index 75285bdd12..32ed515aef 100644
--- a/lib/net/rte_ether.h
+++ b/lib/net/rte_ether.h
@@ -71,9 +71,9 @@ extern "C" {
  * administrator and does not contain OUIs.
  * See http://standards.ieee.org/regauth/groupmac/tutorial.html
  */
-struct rte_ether_addr {
+struct __rte_aligned(2) rte_ether_addr {
 	uint8_t addr_bytes[RTE_ETHER_ADDR_LEN]; /**< Addr bytes in tx order */
-} __rte_aligned(2);
+};
 
 #define RTE_ETHER_LOCAL_ADMIN_ADDR 0x02 /**< Locally assigned Eth. address. */
 #define RTE_ETHER_GROUP_ADDR  0x01 /**< Multicast or broadcast Eth. address. */
@@ -290,11 +290,11 @@ rte_ether_unformat_addr(const char *str, struct rte_ether_addr *eth_addr);
  * Ethernet header: Contains the destination address, source address
  * and frame type.
  */
-struct rte_ether_hdr {
+struct __rte_aligned(2) rte_ether_hdr {
 	struct rte_ether_addr dst_addr; /**< Destination address. */
 	struct rte_ether_addr src_addr; /**< Source address. */
 	rte_be16_t ether_type; /**< Frame type. */
-} __rte_aligned(2);
+};
 
 /**
  * Ethernet VLAN Header.
diff --git a/lib/node/node_private.h b/lib/node/node_private.h
index 64843f7d70..1de7306792 100644
--- a/lib/node/node_private.h
+++ b/lib/node/node_private.h
@@ -50,9 +50,9 @@ extern int node_mbuf_priv1_dynfield_offset;
 /**
  * Node mbuf private area 2.
  */
-struct node_mbuf_priv2 {
+struct __rte_cache_aligned node_mbuf_priv2 {
 	uint64_t priv_data;
-} __rte_cache_aligned;
+};
 
 #define NODE_MBUF_PRIV2_SIZE sizeof(struct node_mbuf_priv2)
 
diff --git a/lib/pdcp/rte_pdcp.h b/lib/pdcp/rte_pdcp.h
index dd8b6e4d34..f74524f83d 100644
--- a/lib/pdcp/rte_pdcp.h
+++ b/lib/pdcp/rte_pdcp.h
@@ -49,7 +49,7 @@ typedef uint16_t (*rte_pdcp_post_p_t)(const struct rte_pdcp_entity *entity,
  * A PDCP entity is associated either to the control plane or the user plane
  * depending on which radio bearer it is carrying data for.
  */
-struct rte_pdcp_entity {
+struct __rte_cache_aligned rte_pdcp_entity {
 	/** Entity specific pre-process handle. */
 	rte_pdcp_pre_p_t pre_process;
 	/** Entity specific post-process handle. */
@@ -66,7 +66,7 @@ struct rte_pdcp_entity {
 	 * hold additionally 'max_pkt_cache' number of packets.
 	 */
 	uint32_t max_pkt_cache;
-} __rte_cache_aligned;
+};
 
 /**
  * Callback function type for t-Reordering timer start, set during PDCP entity establish.
diff --git a/lib/pipeline/rte_pipeline.c b/lib/pipeline/rte_pipeline.c
index 945bb025a1..a09a89f746 100644
--- a/lib/pipeline/rte_pipeline.c
+++ b/lib/pipeline/rte_pipeline.c
@@ -104,7 +104,7 @@ struct rte_table {
 
 #define RTE_PIPELINE_MAX_NAME_SZ                           124
 
-struct rte_pipeline {
+struct __rte_cache_aligned rte_pipeline {
 	/* Input parameters */
 	char name[RTE_PIPELINE_MAX_NAME_SZ];
 	int socket_id;
@@ -132,7 +132,7 @@ struct rte_pipeline {
 	uint64_t pkts_mask;
 	uint64_t n_pkts_ah_drop;
 	uint64_t pkts_drop_mask;
-} __rte_cache_aligned;
+};
 
 static inline uint32_t
 rte_mask_get_next(uint64_t mask, uint32_t pos)
diff --git a/lib/pipeline/rte_table_action.c b/lib/pipeline/rte_table_action.c
index dfdbc66b08..87c3e0e2c9 100644
--- a/lib/pipeline/rte_table_action.c
+++ b/lib/pipeline/rte_table_action.c
@@ -465,11 +465,11 @@ struct encap_qinq_data {
 	((((uint64_t)(s)) & 0x1LLU) << 8) |                \
 	(((uint64_t)(ttl)) & 0xFFLLU)))
 
-struct encap_mpls_data {
+struct __rte_aligned(2) encap_mpls_data {
 	struct rte_ether_hdr ether;
 	uint32_t mpls[RTE_TABLE_ACTION_MPLS_LABELS_MAX];
 	uint32_t mpls_count;
-} __rte_packed __rte_aligned(2);
+} __rte_packed;
 
 #define PPP_PROTOCOL_IP                                    0x0021
 
@@ -487,42 +487,42 @@ struct encap_pppoe_data {
 
 #define IP_PROTO_UDP                                       17
 
-struct encap_vxlan_ipv4_data {
+struct __rte_aligned(2) encap_vxlan_ipv4_data {
 	struct rte_ether_hdr ether;
 	struct rte_ipv4_hdr ipv4;
 	struct rte_udp_hdr udp;
 	struct rte_vxlan_hdr vxlan;
-} __rte_packed __rte_aligned(2);
+} __rte_packed;
 
-struct encap_vxlan_ipv4_vlan_data {
+struct __rte_aligned(2) encap_vxlan_ipv4_vlan_data {
 	struct rte_ether_hdr ether;
 	struct rte_vlan_hdr vlan;
 	struct rte_ipv4_hdr ipv4;
 	struct rte_udp_hdr udp;
 	struct rte_vxlan_hdr vxlan;
-} __rte_packed __rte_aligned(2);
+} __rte_packed;
 
-struct encap_vxlan_ipv6_data {
+struct __rte_aligned(2) encap_vxlan_ipv6_data {
 	struct rte_ether_hdr ether;
 	struct rte_ipv6_hdr ipv6;
 	struct rte_udp_hdr udp;
 	struct rte_vxlan_hdr vxlan;
-} __rte_packed __rte_aligned(2);
+} __rte_packed;
 
-struct encap_vxlan_ipv6_vlan_data {
+struct __rte_aligned(2) encap_vxlan_ipv6_vlan_data {
 	struct rte_ether_hdr ether;
 	struct rte_vlan_hdr vlan;
 	struct rte_ipv6_hdr ipv6;
 	struct rte_udp_hdr udp;
 	struct rte_vxlan_hdr vxlan;
-} __rte_packed __rte_aligned(2);
+} __rte_packed;
 
-struct encap_qinq_pppoe_data {
+struct __rte_aligned(2) encap_qinq_pppoe_data {
 	struct rte_ether_hdr ether;
 	struct rte_vlan_hdr svlan;
 	struct rte_vlan_hdr cvlan;
 	struct pppoe_ppp_hdr pppoe_ppp;
-} __rte_packed __rte_aligned(2);
+} __rte_packed;
 
 static size_t
 encap_data_size(struct rte_table_action_encap_config *encap)
diff --git a/lib/port/rte_port_frag.c b/lib/port/rte_port_frag.c
index 883601a9ae..0940f945bd 100644
--- a/lib/port/rte_port_frag.c
+++ b/lib/port/rte_port_frag.c
@@ -34,7 +34,7 @@ typedef int32_t
 			struct rte_mempool *pool_direct,
 			struct rte_mempool *pool_indirect);
 
-struct rte_port_ring_reader_frag {
+struct __rte_cache_aligned rte_port_ring_reader_frag {
 	struct rte_port_in_stats stats;
 
 	/* Input parameters */
@@ -53,7 +53,7 @@ struct rte_port_ring_reader_frag {
 	uint32_t pos_frags;
 
 	frag_op f_frag;
-} __rte_cache_aligned;
+};
 
 static void *
 rte_port_ring_reader_frag_create(void *params, int socket_id, int is_ipv4)
diff --git a/lib/power/power_acpi_cpufreq.c b/lib/power/power_acpi_cpufreq.c
index f8d978d03d..81996e1c13 100644
--- a/lib/power/power_acpi_cpufreq.c
+++ b/lib/power/power_acpi_cpufreq.c
@@ -41,7 +41,7 @@ enum power_state {
 /**
  * Power info per lcore.
  */
-struct acpi_power_info {
+struct __rte_cache_aligned acpi_power_info {
 	unsigned int lcore_id;               /**< Logical core id */
 	uint32_t freqs[RTE_MAX_LCORE_FREQS]; /**< Frequency array */
 	uint32_t nb_freqs;                   /**< number of available freqs */
@@ -51,7 +51,7 @@ struct acpi_power_info {
 	RTE_ATOMIC(uint32_t) state;          /**< Power in use state */
 	uint16_t turbo_available;            /**< Turbo Boost available */
 	uint16_t turbo_enable;               /**< Turbo Boost enable/disable */
-} __rte_cache_aligned;
+};
 
 static struct acpi_power_info lcore_power_info[RTE_MAX_LCORE];
 
diff --git a/lib/power/power_amd_pstate_cpufreq.c b/lib/power/power_amd_pstate_cpufreq.c
index 028f84416b..090a0d96cb 100644
--- a/lib/power/power_amd_pstate_cpufreq.c
+++ b/lib/power/power_amd_pstate_cpufreq.c
@@ -45,7 +45,7 @@ enum power_state {
 /**
  * Power info per lcore.
  */
-struct amd_pstate_power_info {
+struct __rte_cache_aligned amd_pstate_power_info {
 	uint32_t lcore_id;                   /**< Logical core id */
 	RTE_ATOMIC(uint32_t) state;          /**< Power in use state */
 	FILE *f;                             /**< FD of scaling_setspeed */
@@ -58,7 +58,7 @@ struct amd_pstate_power_info {
 	uint16_t turbo_enable;               /**< Turbo Boost enable/disable */
 	uint32_t nb_freqs;                   /**< number of available freqs */
 	uint32_t freqs[RTE_MAX_LCORE_FREQS]; /**< Frequency array */
-} __rte_cache_aligned;
+};
 
 static struct amd_pstate_power_info lcore_power_info[RTE_MAX_LCORE];
 
diff --git a/lib/power/power_cppc_cpufreq.c b/lib/power/power_cppc_cpufreq.c
index 3ddf39bd76..32aaacb948 100644
--- a/lib/power/power_cppc_cpufreq.c
+++ b/lib/power/power_cppc_cpufreq.c
@@ -49,7 +49,7 @@ enum power_state {
 /**
  * Power info per lcore.
  */
-struct cppc_power_info {
+struct __rte_cache_aligned cppc_power_info {
 	unsigned int lcore_id;               /**< Logical core id */
 	RTE_ATOMIC(uint32_t) state;          /**< Power in use state */
 	FILE *f;                             /**< FD of scaling_setspeed */
@@ -61,7 +61,7 @@ struct cppc_power_info {
 	uint16_t turbo_enable;               /**< Turbo Boost enable/disable */
 	uint32_t nb_freqs;                   /**< number of available freqs */
 	uint32_t freqs[RTE_MAX_LCORE_FREQS]; /**< Frequency array */
-} __rte_cache_aligned;
+};
 
 static struct cppc_power_info lcore_power_info[RTE_MAX_LCORE];
 
diff --git a/lib/power/power_intel_uncore.c b/lib/power/power_intel_uncore.c
index 3ce8fccec2..9c152e4ed2 100644
--- a/lib/power/power_intel_uncore.c
+++ b/lib/power/power_intel_uncore.c
@@ -29,7 +29,7 @@
 		"/sys/devices/system/cpu/intel_uncore_frequency/package_%02u_die_%02u/initial_min_freq_khz"
 
 
-struct uncore_power_info {
+struct __rte_cache_aligned uncore_power_info {
 	unsigned int die;                  /* Core die id */
 	unsigned int pkg;                  /* Package id */
 	uint32_t freqs[MAX_UNCORE_FREQS];  /* Frequency array */
@@ -41,7 +41,7 @@ struct uncore_power_info {
 	uint32_t org_max_freq;             /* Original max freq of uncore */
 	uint32_t init_max_freq;            /* System max uncore freq */
 	uint32_t init_min_freq;            /* System min uncore freq */
-} __rte_cache_aligned;
+};
 
 static struct uncore_power_info uncore_info[RTE_MAX_NUMA_NODES][MAX_NUMA_DIE];
 
diff --git a/lib/power/power_pstate_cpufreq.c b/lib/power/power_pstate_cpufreq.c
index 73138dc4e4..2343121621 100644
--- a/lib/power/power_pstate_cpufreq.c
+++ b/lib/power/power_pstate_cpufreq.c
@@ -49,7 +49,7 @@ enum power_state {
 	POWER_UNKNOWN
 };
 
-struct pstate_power_info {
+struct __rte_cache_aligned pstate_power_info {
 	unsigned int lcore_id;               /**< Logical core id */
 	uint32_t freqs[RTE_MAX_LCORE_FREQS]; /**< Frequency array */
 	uint32_t nb_freqs;                   /**< number of available freqs */
@@ -64,7 +64,7 @@ struct pstate_power_info {
 	uint16_t turbo_available;            /**< Turbo Boost available */
 	uint16_t turbo_enable;               /**< Turbo Boost enable/disable */
 	uint16_t priority_core;              /**< High Performance core */
-} __rte_cache_aligned;
+};
 
 
 static struct pstate_power_info lcore_power_info[RTE_MAX_LCORE];
diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
index 591fc69f36..b1c18a5f56 100644
--- a/lib/power/rte_power_pmd_mgmt.c
+++ b/lib/power/rte_power_pmd_mgmt.c
@@ -55,7 +55,7 @@ struct queue_list_entry {
 	const struct rte_eth_rxtx_callback *cb;
 };
 
-struct pmd_core_cfg {
+struct __rte_cache_aligned pmd_core_cfg {
 	TAILQ_HEAD(queue_list_head, queue_list_entry) head;
 	/**< List of queues associated with this lcore */
 	size_t n_queues;
@@ -68,7 +68,7 @@ struct pmd_core_cfg {
 	/**< Number of queues ready to enter power optimized state */
 	uint64_t sleep_target;
 	/**< Prevent a queue from triggering sleep multiple times */
-} __rte_cache_aligned;
+};
 static struct pmd_core_cfg lcore_cfgs[RTE_MAX_LCORE];
 
 static inline bool
diff --git a/lib/rawdev/rte_rawdev.h b/lib/rawdev/rte_rawdev.h
index 7d5764d971..640037b524 100644
--- a/lib/rawdev/rte_rawdev.h
+++ b/lib/rawdev/rte_rawdev.h
@@ -279,7 +279,7 @@ rte_rawdev_reset(uint16_t dev_id);
  * It is a placeholder for PMD specific data, encapsulating only information
  * related to framework.
  */
-struct rte_rawdev {
+struct __rte_cache_aligned rte_rawdev {
 	/**< Socket ID where memory is allocated */
 	int socket_id;
 	/**< Device ID for this instance */
@@ -300,7 +300,7 @@ struct rte_rawdev {
 	rte_rawdev_obj_t dev_private;
 	/**< Device name */
 	char name[RTE_RAWDEV_NAME_MAX_LEN];
-} __rte_cache_aligned;
+};
 
 /** @internal The pool of rte_rawdev structures. */
 extern struct rte_rawdev *rte_rawdevs;
diff --git a/lib/rcu/rte_rcu_qsbr.h b/lib/rcu/rte_rcu_qsbr.h
index f0b21c5b39..0506191b80 100644
--- a/lib/rcu/rte_rcu_qsbr.h
+++ b/lib/rcu/rte_rcu_qsbr.h
@@ -70,7 +70,7 @@ extern int rte_rcu_log_type;
 #define RTE_QSBR_THRID_INVALID 0xffffffff
 
 /* Worker thread counter */
-struct rte_rcu_qsbr_cnt {
+struct __rte_cache_aligned rte_rcu_qsbr_cnt {
 	RTE_ATOMIC(uint64_t) cnt;
 	/**< Quiescent state counter. Value 0 indicates the thread is offline
 	 *   64b counter is used to avoid adding more code to address
@@ -79,7 +79,7 @@ struct rte_rcu_qsbr_cnt {
 	 */
 	RTE_ATOMIC(uint32_t) lock_cnt;
 	/**< Lock counter. Used when RTE_LIBRTE_RCU_DEBUG is enabled */
-} __rte_cache_aligned;
+};
 
 #define __RTE_QSBR_CNT_THR_OFFLINE 0
 #define __RTE_QSBR_CNT_INIT 1
@@ -92,7 +92,7 @@ struct rte_rcu_qsbr_cnt {
  * 1) Quiescent state counter array
  * 2) Register thread ID array
  */
-struct rte_rcu_qsbr {
+struct __rte_cache_aligned rte_rcu_qsbr {
 	alignas(RTE_CACHE_LINE_SIZE) RTE_ATOMIC(uint64_t) token;
 	/**< Counter to allow for multiple concurrent quiescent state queries */
 	RTE_ATOMIC(uint64_t) acked_token;
@@ -113,7 +113,7 @@ struct rte_rcu_qsbr {
 	/**< Registered thread IDs are stored in a bitmap array,
 	 *   after the quiescent state counter array.
 	 */
-} __rte_cache_aligned;
+};
 
 /**
  * Call back function called to free the resources.
diff --git a/lib/regexdev/rte_regexdev_core.h b/lib/regexdev/rte_regexdev_core.h
index 15ba712b86..32eef6ef56 100644
--- a/lib/regexdev/rte_regexdev_core.h
+++ b/lib/regexdev/rte_regexdev_core.h
@@ -144,13 +144,13 @@ enum rte_regexdev_state {
  * This structure is safe to place in shared memory to be common among different
  * processes in a multi-process configuration.
  */
-struct rte_regexdev_data {
+struct __rte_cache_aligned rte_regexdev_data {
 	void *dev_private; /**< PMD-specific private data. */
 	char dev_name[RTE_REGEXDEV_NAME_MAX_LEN]; /**< Unique identifier name */
 	uint16_t dev_id; /**< Device [external]  identifier. */
 	struct rte_regexdev_config dev_conf; /**< RegEx configuration. */
 	uint8_t dev_started : 1; /**< Device started to work. */
-} __rte_cache_aligned;
+};
 
 /**
  * @internal
@@ -162,7 +162,7 @@ struct rte_regexdev_data {
  * memory. This split allows the function pointer and driver data to be per-
  * process, while the actual configuration data for the device is shared.
  */
-struct rte_regexdev {
+struct __rte_cache_aligned rte_regexdev {
 	regexdev_enqueue_t enqueue;
 	regexdev_dequeue_t dequeue;
 	const struct rte_regexdev_ops *dev_ops;
@@ -170,7 +170,7 @@ struct rte_regexdev {
 	struct rte_device *device; /**< Backing device */
 	enum rte_regexdev_state state; /**< The device state. */
 	struct rte_regexdev_data *data;  /**< Pointer to device data. */
-} __rte_cache_aligned;
+};
 
 /**
  * @internal
diff --git a/lib/reorder/rte_reorder.c b/lib/reorder/rte_reorder.c
index c080b2c858..ae97e1a34d 100644
--- a/lib/reorder/rte_reorder.c
+++ b/lib/reorder/rte_reorder.c
@@ -37,16 +37,16 @@ EAL_REGISTER_TAILQ(rte_reorder_tailq)
 int rte_reorder_seqn_dynfield_offset = -1;
 
 /* A generic circular buffer */
-struct cir_buffer {
+struct __rte_cache_aligned cir_buffer {
 	unsigned int size;   /**< Number of entries that can be stored */
 	unsigned int mask;   /**< [buffer_size - 1]: used for wrap-around */
 	unsigned int head;   /**< insertion point in buffer */
 	unsigned int tail;   /**< extraction point in buffer */
 	struct rte_mbuf **entries;
-} __rte_cache_aligned;
+};
 
 /* The reorder buffer data structure itself */
-struct rte_reorder_buffer {
+struct __rte_cache_aligned rte_reorder_buffer {
 	char name[RTE_REORDER_NAMESIZE];
 	uint32_t min_seqn;  /**< Lowest seq. number that can be in the buffer */
 	unsigned int memsize; /**< memory area size of reorder buffer */
@@ -54,7 +54,7 @@ struct rte_reorder_buffer {
 
 	struct cir_buffer ready_buf; /**< temp buffer for dequeued entries */
 	struct cir_buffer order_buf; /**< buffer used to reorder entries */
-} __rte_cache_aligned;
+};
 
 static void
 rte_reorder_free_mbufs(struct rte_reorder_buffer *b);
diff --git a/lib/ring/rte_ring_core.h b/lib/ring/rte_ring_core.h
index da647eaa66..270869d214 100644
--- a/lib/ring/rte_ring_core.h
+++ b/lib/ring/rte_ring_core.h
@@ -130,20 +130,20 @@ struct rte_ring {
 	RTE_CACHE_GUARD;
 
 	/** Ring producer status. */
-	union {
+	union __rte_cache_aligned {
 		struct rte_ring_headtail prod;
 		struct rte_ring_hts_headtail hts_prod;
 		struct rte_ring_rts_headtail rts_prod;
-	}  __rte_cache_aligned;
+	};
 
 	RTE_CACHE_GUARD;
 
 	/** Ring consumer status. */
-	union {
+	union __rte_cache_aligned {
 		struct rte_ring_headtail cons;
 		struct rte_ring_hts_headtail hts_cons;
 		struct rte_ring_rts_headtail rts_cons;
-	}  __rte_cache_aligned;
+	};
 
 	RTE_CACHE_GUARD;
 };
diff --git a/lib/ring/rte_ring_peek_zc.h b/lib/ring/rte_ring_peek_zc.h
index 8fb279c37e..0b5e34b731 100644
--- a/lib/ring/rte_ring_peek_zc.h
+++ b/lib/ring/rte_ring_peek_zc.h
@@ -79,7 +79,7 @@ extern "C" {
  * This structure contains the pointers and length of the space
  * reserved on the ring storage.
  */
-struct rte_ring_zc_data {
+struct __rte_cache_aligned rte_ring_zc_data {
 	/* Pointer to the first space in the ring */
 	void *ptr1;
 	/* Pointer to the second space in the ring if there is wrap-around.
@@ -92,7 +92,7 @@ struct rte_ring_zc_data {
 	 * will give the number of elements available at ptr2.
 	 */
 	unsigned int n1;
-} __rte_cache_aligned;
+};
 
 static __rte_always_inline void
 __rte_ring_get_elem_addr(struct rte_ring *r, uint32_t head,
diff --git a/lib/sched/rte_sched.c b/lib/sched/rte_sched.c
index 1ba73a4001..bbdb5d1d86 100644
--- a/lib/sched/rte_sched.c
+++ b/lib/sched/rte_sched.c
@@ -58,7 +58,7 @@ struct rte_sched_pipe_profile {
 	uint8_t  wrr_cost[RTE_SCHED_BE_QUEUES_PER_PIPE];
 };
 
-struct rte_sched_pipe {
+struct __rte_cache_aligned rte_sched_pipe {
 	/* Token bucket (TB) */
 	uint64_t tb_time; /* time of last update */
 	uint64_t tb_credits;
@@ -76,7 +76,7 @@ struct rte_sched_pipe {
 	/* TC oversubscription */
 	uint64_t tc_ov_credits;
 	uint8_t tc_ov_period_id;
-} __rte_cache_aligned;
+};
 
 struct rte_sched_queue {
 	uint16_t qw;
@@ -146,7 +146,7 @@ struct rte_sched_grinder {
 	uint8_t wrr_cost[RTE_SCHED_BE_QUEUES_PER_PIPE];
 };
 
-struct rte_sched_subport {
+struct __rte_cache_aligned rte_sched_subport {
 	/* Token bucket (TB) */
 	uint64_t tb_time; /* time of last update */
 	uint64_t tb_credits;
@@ -214,9 +214,9 @@ struct rte_sched_subport {
 	uint8_t *bmp_array;
 	struct rte_mbuf **queue_array;
 	alignas(RTE_CACHE_LINE_SIZE) uint8_t memory[0];
-} __rte_cache_aligned;
+};
 
-struct rte_sched_port {
+struct __rte_cache_aligned rte_sched_port {
 	/* User parameters */
 	uint32_t n_subports_per_port;
 	uint32_t n_pipes_per_subport;
@@ -246,7 +246,7 @@ struct rte_sched_port {
 	/* Large data structures */
 	struct rte_sched_subport_profile *subport_profiles;
 	alignas(RTE_CACHE_LINE_SIZE) struct rte_sched_subport *subports[0];
-} __rte_cache_aligned;
+};
 
 enum rte_sched_subport_array {
 	e_RTE_SCHED_SUBPORT_ARRAY_PIPE = 0,
diff --git a/lib/sched/rte_sched_common.h b/lib/sched/rte_sched_common.h
index 419700b1a5..573d164569 100644
--- a/lib/sched/rte_sched_common.h
+++ b/lib/sched/rte_sched_common.h
@@ -12,8 +12,6 @@ extern "C" {
 #include <stdint.h>
 #include <sys/types.h>
 
-#define __rte_aligned_16 __rte_aligned(16)
-
 #if 0
 static inline uint32_t
 rte_min_pos_4_u16(uint16_t *x)
diff --git a/lib/stack/rte_stack.h b/lib/stack/rte_stack.h
index 1ed8514b07..3325757568 100644
--- a/lib/stack/rte_stack.h
+++ b/lib/stack/rte_stack.h
@@ -73,7 +73,7 @@ struct rte_stack_std {
 /* The RTE stack structure contains the LIFO structure itself, plus metadata
  * such as its name and memzone pointer.
  */
-struct rte_stack {
+struct __rte_cache_aligned rte_stack {
 	/** Name of the stack. */
 	alignas(RTE_CACHE_LINE_SIZE) char name[RTE_STACK_NAMESIZE];
 	/** Memzone containing the rte_stack structure. */
@@ -84,7 +84,7 @@ struct rte_stack {
 		struct rte_stack_lf stack_lf; /**< Lock-free LIFO structure. */
 		struct rte_stack_std stack_std;	/**< LIFO structure. */
 	};
-} __rte_cache_aligned;
+};
 
 /**
  * The stack uses lock-free push and pop functions. This flag is only
diff --git a/lib/table/rte_swx_table_learner.c b/lib/table/rte_swx_table_learner.c
index 2b5e6bdce1..55a3645e06 100644
--- a/lib/table/rte_swx_table_learner.c
+++ b/lib/table/rte_swx_table_learner.c
@@ -145,13 +145,13 @@ struct table_params {
 	size_t total_size;
 };
 
-struct table {
+struct __rte_cache_aligned table {
 	/* Table parameters. */
 	struct table_params params;
 
 	/* Table buckets. */
 	uint8_t buckets[];
-} __rte_cache_aligned;
+};
 
 /* The timeout (in cycles) is stored in the table as a 32-bit value by truncating its least
  * significant 32 bits. Therefore, to make sure the time is always advancing when adding the timeout
diff --git a/lib/table/rte_table_array.c b/lib/table/rte_table_array.c
index 58db67b638..71aada4aaf 100644
--- a/lib/table/rte_table_array.c
+++ b/lib/table/rte_table_array.c
@@ -28,7 +28,7 @@
 
 #endif
 
-struct rte_table_array {
+struct __rte_cache_aligned rte_table_array {
 	struct rte_table_stats stats;
 
 	/* Input parameters */
@@ -41,7 +41,7 @@ struct rte_table_array {
 
 	/* Internal table */
 	alignas(RTE_CACHE_LINE_SIZE) uint8_t array[0];
-} __rte_cache_aligned;
+};
 
 static void *
 rte_table_array_create(void *params, int socket_id, uint32_t entry_size)
diff --git a/lib/timer/rte_timer.c b/lib/timer/rte_timer.c
index 53ed22199d..bb8b6a651d 100644
--- a/lib/timer/rte_timer.c
+++ b/lib/timer/rte_timer.c
@@ -24,7 +24,7 @@
 /**
  * Per-lcore info for timers.
  */
-struct priv_timer {
+struct __rte_cache_aligned priv_timer {
 	struct rte_timer pending_head;  /**< dummy timer instance to head up list */
 	rte_spinlock_t list_lock;       /**< lock to protect list access */
 
@@ -44,7 +44,7 @@ struct priv_timer {
 	/** per-lcore statistics */
 	struct rte_timer_debug_stats stats;
 #endif
-} __rte_cache_aligned;
+};
 
 #define FL_ALLOCATED	(1 << 0)
 struct rte_timer_data {
diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index 08e4ab9b13..cd3fa55f1b 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -272,7 +272,7 @@ struct vhost_async {
 /**
  * Structure contains variables relevant to RX/TX virtqueues.
  */
-struct vhost_virtqueue {
+struct __rte_cache_aligned vhost_virtqueue {
 	union {
 		struct vring_desc	*desc;
 		struct vring_packed_desc   *desc_packed;
@@ -351,7 +351,7 @@ struct vhost_virtqueue {
 	struct virtqueue_stats	stats;
 
 	RTE_ATOMIC(bool) irq_pending;
-} __rte_cache_aligned;
+};
 
 /* Virtio device status as per Virtio specification */
 #define VIRTIO_DEVICE_STATUS_RESET		0x00
@@ -479,7 +479,7 @@ struct inflight_mem_info {
  * Device structure contains all configuration information relating
  * to the device.
  */
-struct virtio_net {
+struct __rte_cache_aligned virtio_net {
 	/* Frontend (QEMU) memory and memory region information */
 	struct rte_vhost_memory	*mem;
 	uint64_t		features;
@@ -538,7 +538,7 @@ struct virtio_net {
 	struct rte_vhost_user_extern_ops extern_ops;
 
 	struct vhost_backend_ops *backend_ops;
-} __rte_cache_aligned;
+};
 
 static inline void
 vq_assert_lock__(struct virtio_net *dev, struct vhost_virtqueue *vq, const char *func)
diff --git a/lib/vhost/vhost_crypto.c b/lib/vhost/vhost_crypto.c
index 75f1a9a8b0..7caf6d9afa 100644
--- a/lib/vhost/vhost_crypto.c
+++ b/lib/vhost/vhost_crypto.c
@@ -191,7 +191,7 @@ static int get_iv_len(enum rte_crypto_cipher_algorithm algo)
  * one DPDK crypto device that deals with all crypto workloads. It is declared
  * here and defined in vhost_crypto.c
  */
-struct vhost_crypto {
+struct __rte_cache_aligned vhost_crypto {
 	/** Used to lookup DPDK Cryptodev Session based on VIRTIO crypto
 	 *  session ID.
 	 */
@@ -214,7 +214,7 @@ struct vhost_crypto {
 	struct virtio_net *dev;
 
 	uint8_t option;
-} __rte_cache_aligned;
+};
 
 struct vhost_crypto_writeback_data {
 	uint8_t *src;