diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-04-26 19:44:16 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-04-26 19:44:16 +0200 |
commit | 4d480dbf21f3385e9957b1ee8dadee35548f4516 (patch) | |
tree | 73a61f83b520c71cd8d63dd9bab0cace25f3ae56 /drivers | |
parent | Merge tag 'for-linus-5.13-rc1-tag' of git://git.kernel.org/pub/scm/linux/kern... (diff) | |
parent | drivers: hv: Create a consistent pattern for checking Hyper-V hypercall status (diff) | |
download | linux-4d480dbf21f3385e9957b1ee8dadee35548f4516.tar.xz linux-4d480dbf21f3385e9957b1ee8dadee35548f4516.zip |
Merge tag 'hyperv-next-signed-20210426' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux
Pull Hyper-V updates from Wei Liu:
- VMBus enhancement
- Free page reporting support for Hyper-V balloon driver
- Some patches for running Linux as Arm64 Hyper-V guest
- A few misc clean-up patches
* tag 'hyperv-next-signed-20210426' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux: (30 commits)
drivers: hv: Create a consistent pattern for checking Hyper-V hypercall status
x86/hyperv: Move hv_do_rep_hypercall to asm-generic
video: hyperv_fb: Add ratelimit on error message
Drivers: hv: vmbus: Increase wait time for VMbus unload
Drivers: hv: vmbus: Initialize unload_event statically
Drivers: hv: vmbus: Check for pending channel interrupts before taking a CPU offline
Drivers: hv: vmbus: Drivers: hv: vmbus: Introduce CHANNELMSG_MODIFYCHANNEL_RESPONSE
Drivers: hv: vmbus: Introduce and negotiate VMBus protocol version 5.3
Drivers: hv: vmbus: Use after free in __vmbus_open()
Drivers: hv: vmbus: remove unused function
Drivers: hv: vmbus: Remove unused linux/version.h header
x86/hyperv: remove unused linux/version.h header
x86/Hyper-V: Support for free page reporting
x86/hyperv: Fix unused variable 'hi' warning in hv_apic_read
x86/hyperv: Fix unused variable 'msr_val' warning in hv_qlock_wait
hv: hyperv.h: a few mundane typo fixes
drivers: hv: Fix EXPORT_SYMBOL and tab spaces issue
Drivers: hv: vmbus: Drop error message when 'No request id available'
asm-generic/hyperv: Add missing function prototypes per -W1 warnings
clocksource/drivers/hyper-v: Move handling of STIMER0 interrupts
...
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/clocksource/hyperv_timer.c | 249 | ||||
-rw-r--r-- | drivers/hv/Kconfig | 1 | ||||
-rw-r--r-- | drivers/hv/channel.c | 103 | ||||
-rw-r--r-- | drivers/hv/channel_mgmt.c | 86 | ||||
-rw-r--r-- | drivers/hv/connection.c | 7 | ||||
-rw-r--r-- | drivers/hv/hv.c | 152 | ||||
-rw-r--r-- | drivers/hv/hv_balloon.c | 89 | ||||
-rw-r--r-- | drivers/hv/hv_trace.h | 15 | ||||
-rw-r--r-- | drivers/hv/ring_buffer.c | 10 | ||||
-rw-r--r-- | drivers/hv/vmbus_drv.c | 93 | ||||
-rw-r--r-- | drivers/pci/controller/pci-hyperv.c | 2 | ||||
-rw-r--r-- | drivers/video/fbdev/hyperv_fb.c | 2 |
12 files changed, 651 insertions, 158 deletions
diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c index a02b0a224807..977fd05ac35f 100644 --- a/drivers/clocksource/hyperv_timer.c +++ b/drivers/clocksource/hyperv_timer.c @@ -18,6 +18,9 @@ #include <linux/sched_clock.h> #include <linux/mm.h> #include <linux/cpuhotplug.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/acpi.h> #include <clocksource/hyperv_timer.h> #include <asm/hyperv-tlfs.h> #include <asm/mshyperv.h> @@ -43,14 +46,13 @@ static u64 hv_sched_clock_offset __ro_after_init; */ static bool direct_mode_enabled; -static int stimer0_irq; -static int stimer0_vector; +static int stimer0_irq = -1; static int stimer0_message_sint; +static DEFINE_PER_CPU(long, stimer0_evt); /* - * ISR for when stimer0 is operating in Direct Mode. Direct Mode - * does not use VMbus or any VMbus messages, so process here and not - * in the VMbus driver code. + * Common code for stimer0 interrupts coming via Direct Mode or + * as a VMbus message. */ void hv_stimer0_isr(void) { @@ -61,6 +63,16 @@ void hv_stimer0_isr(void) } EXPORT_SYMBOL_GPL(hv_stimer0_isr); +/* + * stimer0 interrupt handler for architectures that support + * per-cpu interrupts, which also implies Direct Mode. + */ +static irqreturn_t hv_stimer0_percpu_isr(int irq, void *dev_id) +{ + hv_stimer0_isr(); + return IRQ_HANDLED; +} + static int hv_ce_set_next_event(unsigned long delta, struct clock_event_device *evt) { @@ -68,16 +80,16 @@ static int hv_ce_set_next_event(unsigned long delta, current_tick = hv_read_reference_counter(); current_tick += delta; - hv_init_timer(0, current_tick); + hv_set_register(HV_REGISTER_STIMER0_COUNT, current_tick); return 0; } static int hv_ce_shutdown(struct clock_event_device *evt) { - hv_init_timer(0, 0); - hv_init_timer_config(0, 0); - if (direct_mode_enabled) - hv_disable_stimer0_percpu_irq(stimer0_irq); + hv_set_register(HV_REGISTER_STIMER0_COUNT, 0); + hv_set_register(HV_REGISTER_STIMER0_CONFIG, 0); + if (direct_mode_enabled && stimer0_irq >= 0) + disable_percpu_irq(stimer0_irq); return 0; } @@ -95,8 +107,9 @@ static int hv_ce_set_oneshot(struct clock_event_device *evt) * on the specified hardware vector/IRQ. */ timer_cfg.direct_mode = 1; - timer_cfg.apic_vector = stimer0_vector; - hv_enable_stimer0_percpu_irq(stimer0_irq); + timer_cfg.apic_vector = HYPERV_STIMER0_VECTOR; + if (stimer0_irq >= 0) + enable_percpu_irq(stimer0_irq, IRQ_TYPE_NONE); } else { /* * When it expires, the timer will generate a VMbus message, @@ -105,7 +118,7 @@ static int hv_ce_set_oneshot(struct clock_event_device *evt) timer_cfg.direct_mode = 0; timer_cfg.sintx = stimer0_message_sint; } - hv_init_timer_config(0, timer_cfg.as_uint64); + hv_set_register(HV_REGISTER_STIMER0_CONFIG, timer_cfg.as_uint64); return 0; } @@ -169,10 +182,58 @@ int hv_stimer_cleanup(unsigned int cpu) } EXPORT_SYMBOL_GPL(hv_stimer_cleanup); +/* + * These placeholders are overridden by arch specific code on + * architectures that need special setup of the stimer0 IRQ because + * they don't support per-cpu IRQs (such as x86/x64). + */ +void __weak hv_setup_stimer0_handler(void (*handler)(void)) +{ +}; + +void __weak hv_remove_stimer0_handler(void) +{ +}; + +/* Called only on architectures with per-cpu IRQs (i.e., not x86/x64) */ +static int hv_setup_stimer0_irq(void) +{ + int ret; + + ret = acpi_register_gsi(NULL, HYPERV_STIMER0_VECTOR, + ACPI_EDGE_SENSITIVE, ACPI_ACTIVE_HIGH); + if (ret < 0) { + pr_err("Can't register Hyper-V stimer0 GSI. Error %d", ret); + return ret; + } + stimer0_irq = ret; + + ret = request_percpu_irq(stimer0_irq, hv_stimer0_percpu_isr, + "Hyper-V stimer0", &stimer0_evt); + if (ret) { + pr_err("Can't request Hyper-V stimer0 IRQ %d. Error %d", + stimer0_irq, ret); + acpi_unregister_gsi(stimer0_irq); + stimer0_irq = -1; + } + return ret; +} + +static void hv_remove_stimer0_irq(void) +{ + if (stimer0_irq == -1) { + hv_remove_stimer0_handler(); + } else { + free_percpu_irq(stimer0_irq, &stimer0_evt); + acpi_unregister_gsi(stimer0_irq); + stimer0_irq = -1; + } +} + /* hv_stimer_alloc - Global initialization of the clockevent and stimer0 */ -int hv_stimer_alloc(void) +int hv_stimer_alloc(bool have_percpu_irqs) { - int ret = 0; + int ret; /* * Synthetic timers are always available except on old versions of @@ -188,29 +249,37 @@ int hv_stimer_alloc(void) direct_mode_enabled = ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE; - if (direct_mode_enabled) { - ret = hv_setup_stimer0_irq(&stimer0_irq, &stimer0_vector, - hv_stimer0_isr); + + /* + * If Direct Mode isn't enabled, the remainder of the initialization + * is done later by hv_stimer_legacy_init() + */ + if (!direct_mode_enabled) + return 0; + + if (have_percpu_irqs) { + ret = hv_setup_stimer0_irq(); if (ret) - goto free_percpu; + goto free_clock_event; + } else { + hv_setup_stimer0_handler(hv_stimer0_isr); + } - /* - * Since we are in Direct Mode, stimer initialization - * can be done now with a CPUHP value in the same range - * as other clockevent devices. - */ - ret = cpuhp_setup_state(CPUHP_AP_HYPERV_TIMER_STARTING, - "clockevents/hyperv/stimer:starting", - hv_stimer_init, hv_stimer_cleanup); - if (ret < 0) - goto free_stimer0_irq; + /* + * Since we are in Direct Mode, stimer initialization + * can be done now with a CPUHP value in the same range + * as other clockevent devices. + */ + ret = cpuhp_setup_state(CPUHP_AP_HYPERV_TIMER_STARTING, + "clockevents/hyperv/stimer:starting", + hv_stimer_init, hv_stimer_cleanup); + if (ret < 0) { + hv_remove_stimer0_irq(); + goto free_clock_event; } return ret; -free_stimer0_irq: - hv_remove_stimer0_irq(stimer0_irq); - stimer0_irq = 0; -free_percpu: +free_clock_event: free_percpu(hv_clock_event); hv_clock_event = NULL; return ret; @@ -254,23 +323,6 @@ void hv_stimer_legacy_cleanup(unsigned int cpu) } EXPORT_SYMBOL_GPL(hv_stimer_legacy_cleanup); - -/* hv_stimer_free - Free global resources allocated by hv_stimer_alloc() */ -void hv_stimer_free(void) -{ - if (!hv_clock_event) - return; - - if (direct_mode_enabled) { - cpuhp_remove_state(CPUHP_AP_HYPERV_TIMER_STARTING); - hv_remove_stimer0_irq(stimer0_irq); - stimer0_irq = 0; - } - free_percpu(hv_clock_event); - hv_clock_event = NULL; -} -EXPORT_SYMBOL_GPL(hv_stimer_free); - /* * Do a global cleanup of clockevents for the cases of kexec and * vmbus exit @@ -287,12 +339,17 @@ void hv_stimer_global_cleanup(void) hv_stimer_legacy_cleanup(cpu); } - /* - * If Direct Mode is enabled, the cpuhp teardown callback - * (hv_stimer_cleanup) will be run on all CPUs to stop the - * stimers. - */ - hv_stimer_free(); + if (!hv_clock_event) + return; + + if (direct_mode_enabled) { + cpuhp_remove_state(CPUHP_AP_HYPERV_TIMER_STARTING); + hv_remove_stimer0_irq(); + stimer0_irq = -1; + } + free_percpu(hv_clock_event); + hv_clock_event = NULL; + } EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup); @@ -302,14 +359,6 @@ EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup); * the other that uses the TSC reference page feature as defined in the * TLFS. The MSR version is for compatibility with old versions of * Hyper-V and 32-bit x86. The TSC reference page version is preferred. - * - * The Hyper-V clocksource ratings of 250 are chosen to be below the - * TSC clocksource rating of 300. In configurations where Hyper-V offers - * an InvariantTSC, the TSC is not marked "unstable", so the TSC clocksource - * is available and preferred. With the higher rating, it will be the - * default. On older hardware and Hyper-V versions, the TSC is marked - * "unstable", so no TSC clocksource is created and the selected Hyper-V - * clocksource will be the default. */ u64 (*hv_read_reference_counter)(void); @@ -331,7 +380,7 @@ static u64 notrace read_hv_clock_tsc(void) u64 current_tick = hv_read_tsc_page(hv_get_tsc_page()); if (current_tick == U64_MAX) - hv_get_time_ref_count(current_tick); + current_tick = hv_get_register(HV_REGISTER_TIME_REF_COUNT); return current_tick; } @@ -352,9 +401,9 @@ static void suspend_hv_clock_tsc(struct clocksource *arg) u64 tsc_msr; /* Disable the TSC page */ - hv_get_reference_tsc(tsc_msr); + tsc_msr = hv_get_register(HV_REGISTER_REFERENCE_TSC); tsc_msr &= ~BIT_ULL(0); - hv_set_reference_tsc(tsc_msr); + hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr); } @@ -364,39 +413,44 @@ static void resume_hv_clock_tsc(struct clocksource *arg) u64 tsc_msr; /* Re-enable the TSC page */ - hv_get_reference_tsc(tsc_msr); + tsc_msr = hv_get_register(HV_REGISTER_REFERENCE_TSC); tsc_msr &= GENMASK_ULL(11, 0); tsc_msr |= BIT_ULL(0) | (u64)phys_addr; - hv_set_reference_tsc(tsc_msr); + hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr); } +#ifdef VDSO_CLOCKMODE_HVCLOCK static int hv_cs_enable(struct clocksource *cs) { - hv_enable_vdso_clocksource(); + vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK); return 0; } +#endif static struct clocksource hyperv_cs_tsc = { .name = "hyperv_clocksource_tsc_page", - .rating = 250, + .rating = 500, .read = read_hv_clock_tsc_cs, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, .suspend= suspend_hv_clock_tsc, .resume = resume_hv_clock_tsc, +#ifdef VDSO_CLOCKMODE_HVCLOCK .enable = hv_cs_enable, + .vdso_clock_mode = VDSO_CLOCKMODE_HVCLOCK, +#else + .vdso_clock_mode = VDSO_CLOCKMODE_NONE, +#endif }; static u64 notrace read_hv_clock_msr(void) { - u64 current_tick; /* * Read the partition counter to get the current tick count. This count * is set to 0 when the partition is created and is incremented in * 100 nanosecond units. */ - hv_get_time_ref_count(current_tick); - return current_tick; + return hv_get_register(HV_REGISTER_TIME_REF_COUNT); } static u64 notrace read_hv_clock_msr_cs(struct clocksource *arg) @@ -412,12 +466,36 @@ static u64 notrace read_hv_sched_clock_msr(void) static struct clocksource hyperv_cs_msr = { .name = "hyperv_clocksource_msr", - .rating = 250, + .rating = 500, .read = read_hv_clock_msr_cs, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; +/* + * Reference to pv_ops must be inline so objtool + * detection of noinstr violations can work correctly. + */ +#ifdef CONFIG_GENERIC_SCHED_CLOCK +static __always_inline void hv_setup_sched_clock(void *sched_clock) +{ + /* + * We're on an architecture with generic sched clock (not x86/x64). + * The Hyper-V sched clock read function returns nanoseconds, not + * the normal 100ns units of the Hyper-V synthetic clock. + */ + sched_clock_register(sched_clock, 64, NSEC_PER_SEC); +} +#elif defined CONFIG_PARAVIRT +static __always_inline void hv_setup_sched_clock(void *sched_clock) +{ + /* We're on x86/x64 *and* using PV ops */ + paravirt_set_sched_clock(sched_clock); +} +#else /* !CONFIG_GENERIC_SCHED_CLOCK && !CONFIG_PARAVIRT */ +static __always_inline void hv_setup_sched_clock(void *sched_clock) {} +#endif /* CONFIG_GENERIC_SCHED_CLOCK */ + static bool __init hv_init_tsc_clocksource(void) { u64 tsc_msr; @@ -429,6 +507,22 @@ static bool __init hv_init_tsc_clocksource(void) if (hv_root_partition) return false; + /* + * If Hyper-V offers TSC_INVARIANT, then the virtualized TSC correctly + * handles frequency and offset changes due to live migration, + * pause/resume, and other VM management operations. So lower the + * Hyper-V Reference TSC rating, causing the generic TSC to be used. + * TSC_INVARIANT is not offered on ARM64, so the Hyper-V Reference + * TSC will be preferred over the virtualized ARM64 arch counter. + * While the Hyper-V MSR clocksource won't be used since the + * Reference TSC clocksource is present, change its rating as + * well for consistency. + */ + if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) { + hyperv_cs_tsc.rating = 250; + hyperv_cs_msr.rating = 250; + } + hv_read_reference_counter = read_hv_clock_tsc; phys_addr = virt_to_phys(hv_get_tsc_page()); @@ -439,12 +533,11 @@ static bool __init hv_init_tsc_clocksource(void) * (which already has at least the low 12 bits set to zero since * it is page aligned). Also set the "enable" bit, which is bit 0. */ - hv_get_reference_tsc(tsc_msr); + tsc_msr = hv_get_register(HV_REGISTER_REFERENCE_TSC); tsc_msr &= GENMASK_ULL(11, 0); tsc_msr = tsc_msr | 0x1 | (u64)phys_addr; - hv_set_reference_tsc(tsc_msr); + hv_set_register(HV_REGISTER_REFERENCE_TSC, tsc_msr); - hv_set_clocksource_vdso(hyperv_cs_tsc); clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100); hv_sched_clock_offset = hv_read_reference_counter(); diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index 79e5356a737a..66c794d92391 100644 --- a/drivers/hv/Kconfig +++ b/drivers/hv/Kconfig @@ -23,6 +23,7 @@ config HYPERV_UTILS config HYPERV_BALLOON tristate "Microsoft Hyper-V Balloon driver" depends on HYPERV + select PAGE_REPORTING help Select this option to enable Hyper-V Balloon driver. diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 0bd202de7960..c2635e913a92 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -209,31 +209,96 @@ int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id, } EXPORT_SYMBOL_GPL(vmbus_send_tl_connect_request); +static int send_modifychannel_without_ack(struct vmbus_channel *channel, u32 target_vp) +{ + struct vmbus_channel_modifychannel msg; + int ret; + + memset(&msg, 0, sizeof(msg)); + msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL; + msg.child_relid = channel->offermsg.child_relid; + msg.target_vp = target_vp; + + ret = vmbus_post_msg(&msg, sizeof(msg), true); + trace_vmbus_send_modifychannel(&msg, ret); + + return ret; +} + +static int send_modifychannel_with_ack(struct vmbus_channel *channel, u32 target_vp) +{ + struct vmbus_channel_modifychannel *msg; + struct vmbus_channel_msginfo *info; + unsigned long flags; + int ret; + + info = kzalloc(sizeof(struct vmbus_channel_msginfo) + + sizeof(struct vmbus_channel_modifychannel), + GFP_KERNEL); + if (!info) + return -ENOMEM; + + init_completion(&info->waitevent); + info->waiting_channel = channel; + + msg = (struct vmbus_channel_modifychannel *)info->msg; + msg->header.msgtype = CHANNELMSG_MODIFYCHANNEL; + msg->child_relid = channel->offermsg.child_relid; + msg->target_vp = target_vp; + + spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); + list_add_tail(&info->msglistentry, &vmbus_connection.chn_msg_list); + spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); + + ret = vmbus_post_msg(msg, sizeof(*msg), true); + trace_vmbus_send_modifychannel(msg, ret); + if (ret != 0) { + spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); + list_del(&info->msglistentry); + spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); + goto free_info; + } + + /* + * Release channel_mutex; otherwise, vmbus_onoffer_rescind() could block on + * the mutex and be unable to signal the completion. + * + * See the caller target_cpu_store() for information about the usage of the + * mutex. + */ + mutex_unlock(&vmbus_connection.channel_mutex); + wait_for_completion(&info->waitevent); + mutex_lock(&vmbus_connection.channel_mutex); + + spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); + list_del(&info->msglistentry); + spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); + + if (info->response.modify_response.status) + ret = -EAGAIN; + +free_info: + kfree(info); + return ret; +} + /* * Set/change the vCPU (@target_vp) the channel (@child_relid) will interrupt. * - * CHANNELMSG_MODIFYCHANNEL messages are aynchronous. Also, Hyper-V does not - * ACK such messages. IOW we can't know when the host will stop interrupting - * the "old" vCPU and start interrupting the "new" vCPU for the given channel. + * CHANNELMSG_MODIFYCHANNEL messages are aynchronous. When VMbus version 5.3 + * or later is negotiated, Hyper-V always sends an ACK in response to such a + * message. For VMbus version 5.2 and earlier, it never sends an ACK. With- + * out an ACK, we can not know when the host will stop interrupting the "old" + * vCPU and start interrupting the "new" vCPU for the given channel. * * The CHANNELMSG_MODIFYCHANNEL message type is supported since VMBus version * VERSION_WIN10_V4_1. */ -int vmbus_send_modifychannel(u32 child_relid, u32 target_vp) +int vmbus_send_modifychannel(struct vmbus_channel *channel, u32 target_vp) { - struct vmbus_channel_modifychannel conn_msg; - int ret; - - memset(&conn_msg, 0, sizeof(conn_msg)); - conn_msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL; - conn_msg.child_relid = child_relid; - conn_msg.target_vp = target_vp; - - ret = vmbus_post_msg(&conn_msg, sizeof(conn_msg), true); - - trace_vmbus_send_modifychannel(&conn_msg, ret); - - return ret; + if (vmbus_proto_version >= VERSION_WIN10_V5_3) + return send_modifychannel_with_ack(channel, target_vp); + return send_modifychannel_without_ack(channel, target_vp); } EXPORT_SYMBOL_GPL(vmbus_send_modifychannel); @@ -385,7 +450,7 @@ nomem: * @kbuffer: from kmalloc or vmalloc * @size: page-size multiple * @send_offset: the offset (in bytes) where the send ring buffer starts, - * should be 0 for BUFFER type gpadl + * should be 0 for BUFFER type gpadl * @gpadl_handle: some funky thing */ static int __vmbus_establish_gpadl(struct vmbus_channel *channel, @@ -653,7 +718,7 @@ static int __vmbus_open(struct vmbus_channel *newchannel, if (newchannel->rescind) { err = -ENODEV; - goto error_free_info; + goto error_clean_msglist; } err = vmbus_post_msg(open_msg, diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index f0ed730e2e4e..caf6d0c4bc1b 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -333,7 +333,6 @@ fw_error: negop->icversion_data[1].minor = icmsg_minor; return found_match; } - EXPORT_SYMBOL_GPL(vmbus_prep_negotiate_resp); /* @@ -593,10 +592,10 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel) * CPUS_READ_UNLOCK CPUS_WRITE_UNLOCK * * Forbids: CPU1's LOAD from *not* seing CPU2's STORE && - * CPU2's SEARCH from *not* seeing CPU1's INSERT + * CPU2's SEARCH from *not* seeing CPU1's INSERT * * Forbids: CPU2's SEARCH from seeing CPU1's INSERT && - * CPU2's LOAD from *not* seing CPU1's STORE + * CPU2's LOAD from *not* seing CPU1's STORE */ cpus_read_lock(); @@ -756,6 +755,12 @@ static void init_vp_index(struct vmbus_channel *channel) free_cpumask_var(available_mask); } +#define UNLOAD_DELAY_UNIT_MS 10 /* 10 milliseconds */ +#define UNLOAD_WAIT_MS (100*1000) /* 100 seconds */ +#define UNLOAD_WAIT_LOOPS (UNLOAD_WAIT_MS/UNLOAD_DELAY_UNIT_MS) +#define UNLOAD_MSG_MS (5*1000) /* Every 5 seconds */ +#define UNLOAD_MSG_LOOPS (UNLOAD_MSG_MS/UNLOAD_DELAY_UNIT_MS) + static void vmbus_wait_for_unload(void) { int cpu; @@ -773,12 +778,17 @@ static void vmbus_wait_for_unload(void) * vmbus_connection.unload_event. If not, the last thing we can do is * read message pages for all CPUs directly. * - * Wait no more than 10 seconds so that the panic path can't get - * hung forever in case the response message isn't seen. + * Wait up to 100 seconds since an Azure host must writeback any dirty + * data in its disk cache before the VMbus UNLOAD request will + * complete. This flushing has been empirically observed to take up + * to 50 seconds in cases with a lot of dirty data, so allow additional + * leeway and for inaccuracies in mdelay(). But eventually time out so + * that the panic path can't get hung forever in case the response + * message isn't seen. */ - for (i = 0; i < 1000; i++) { + for (i = 1; i <= UNLOAD_WAIT_LOOPS; i++) { if (completion_done(&vmbus_connection.unload_event)) - break; + goto completed; for_each_online_cpu(cpu) { struct hv_per_cpu_context *hv_cpu @@ -801,9 +811,18 @@ static void vmbus_wait_for_unload(void) vmbus_signal_eom(msg, message_type); } - mdelay(10); + /* + * Give a notice periodically so someone watching the + * serial output won't think it is completely hung. + */ + if (!(i % UNLOAD_MSG_LOOPS)) + pr_notice("Waiting for VMBus UNLOAD to complete\n"); + + mdelay(UNLOAD_DELAY_UNIT_MS); } + pr_err("Continuing even though VMBus UNLOAD did not complete\n"); +completed: /* * We're crashing and already got the UNLOAD_RESPONSE, cleanup all * maybe-pending messages on all CPUs to be able to receive new @@ -827,6 +846,11 @@ static void vmbus_unload_response(struct vmbus_channel_message_header *hdr) /* * This is a global event; just wakeup the waiting thread. * Once we successfully unload, we can cleanup the monitor state. + * + * NB. A malicious or compromised Hyper-V could send a spurious + * message of type CHANNELMSG_UNLOAD_RESPONSE, and trigger a call + * of the complete() below. Make sure that unload_event has been + * initialized by the time this complete() is executed. */ complete(&vmbus_connection.unload_event); } @@ -842,7 +866,7 @@ void vmbus_initiate_unload(bool crash) if (vmbus_proto_version < VERSION_WIN8_1) return; - init_completion(&vmbus_connection.unload_event); + reinit_completion(&vmbus_connection.unload_event); memset(&hdr, 0, sizeof(struct vmbus_channel_message_header)); hdr.msgtype = CHANNELMSG_UNLOAD; vmbus_post_msg(&hdr, sizeof(struct vmbus_channel_message_header), @@ -980,7 +1004,7 @@ static void vmbus_onoffer(struct vmbus_channel_message_header *hdr) * UNLOCK channel_mutex * * Forbids: r1 == valid_relid && - * channels[valid_relid] == channel + * channels[valid_relid] == channel * * Note. r1 can be INVALID_RELID only for an hv_sock channel. * None of the hv_sock channels which were present before the @@ -1313,6 +1337,46 @@ static void vmbus_ongpadl_created(struct vmbus_channel_message_header *hdr) } /* + * vmbus_onmodifychannel_response - Modify Channel response handler. + * + * This is invoked when we received a response to our channel modify request. + * Find the matching request, copy the response and signal the requesting thread. + */ +static void vmbus_onmodifychannel_response(struct vmbus_channel_message_header *hdr) +{ + struct vmbus_channel_modifychannel_response *response; + struct vmbus_channel_msginfo *msginfo; + unsigned long flags; + + response = (struct vmbus_channel_modifychannel_response *)hdr; + + trace_vmbus_onmodifychannel_response(response); + + /* + * Find the modify msg, copy the response and signal/unblock the wait event. + */ + spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags); + + list_for_each_entry(msginfo, &vmbus_connection.chn_msg_list, msglistentry) { + struct vmbus_channel_message_header *responseheader = + (struct vmbus_channel_message_header *)msginfo->msg; + + if (responseheader->msgtype == CHANNELMSG_MODIFYCHANNEL) { + struct vmbus_channel_modifychannel *modifymsg; + + modifymsg = (struct vmbus_channel_modifychannel *)msginfo->msg; + if (modifymsg->child_relid == response->child_relid) { + memcpy(&msginfo->response.modify_response, response, + sizeof(*response)); + complete(&msginfo->waitevent); + break; + } + } + } + spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags); +} + +/* * vmbus_ongpadl_torndown - GPADL torndown handler. * * This is invoked when we received a response to our gpadl teardown request. @@ -1429,6 +1493,8 @@ channel_message_table[CHANNELMSG_COUNT] = { { CHANNELMSG_TL_CONNECT_REQUEST, 0, NULL, 0}, { CHANNELMSG_MODIFYCHANNEL, 0, NULL, 0}, { CHANNELMSG_TL_CONNECT_RESULT, 0, NULL, 0}, + { CHANNELMSG_MODIFYCHANNEL_RESPONSE, 1, vmbus_onmodifychannel_response, + sizeof(struct vmbus_channel_modifychannel_response)}, }; /* diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index c83612cddb99..311cd005b3be 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -26,9 +26,11 @@ struct vmbus_connection vmbus_connection = { .conn_state = DISCONNECTED, + .unload_event = COMPLETION_INITIALIZER( + vmbus_connection.unload_event), .next_gpadl_handle = ATOMIC_INIT(0xE1E10), - .ready_for_suspend_event= COMPLETION_INITIALIZER( + .ready_for_suspend_event = COMPLETION_INITIALIZER( vmbus_connection.ready_for_suspend_event), .ready_for_resume_event = COMPLETION_INITIALIZER( vmbus_connection.ready_for_resume_event), @@ -45,6 +47,7 @@ EXPORT_SYMBOL_GPL(vmbus_proto_version); * Table of VMBus versions listed from newest to oldest. */ static __u32 vmbus_versions[] = { + VERSION_WIN10_V5_3, VERSION_WIN10_V5_2, VERSION_WIN10_V5_1, VERSION_WIN10_V5, @@ -60,7 +63,7 @@ static __u32 vmbus_versions[] = { * Maximal VMBus protocol version guests can negotiate. Useful to cap the * VMBus version for testing and debugging purpose. */ -static uint max_version = VERSION_WIN10_V5_2; +static uint max_version = VERSION_WIN10_V5_3; module_param(max_version, uint, S_IRUGO); MODULE_PARM_DESC(max_version, diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index f202ac7f4b3d..e83507f49676 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -13,9 +13,10 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/hyperv.h> -#include <linux/version.h> #include <linux/random.h> #include <linux/clockchips.h> +#include <linux/delay.h> +#include <linux/interrupt.h> #include <clocksource/hyperv_timer.h> #include <asm/mshyperv.h> #include "hyperv_vmbus.h" @@ -37,6 +38,42 @@ int hv_init(void) } /* + * Functions for allocating and freeing memory with size and + * alignment HV_HYP_PAGE_SIZE. These functions are needed because + * the guest page size may not be the same as the Hyper-V page + * size. We depend upon kmalloc() aligning power-of-two size + * allocations to the allocation size boundary, so that the + * allocated memory appears to Hyper-V as a page of the size + * it expects. + */ + +void *hv_alloc_hyperv_page(void) +{ + BUILD_BUG_ON(PAGE_SIZE < HV_HYP_PAGE_SIZE); + + if (PAGE_SIZE == HV_HYP_PAGE_SIZE) + return (void *)__get_free_page(GFP_KERNEL); + else + return kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); +} + +void *hv_alloc_hyperv_zeroed_page(void) +{ + if (PAGE_SIZE == HV_HYP_PAGE_SIZE) + return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + else + return kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); +} + +void hv_free_hyperv_page(unsigned long addr) +{ + if (PAGE_SIZE == HV_HYP_PAGE_SIZE) + free_page(addr); + else + kfree((void *)addr); +} + +/* * hv_post_message - Post a message using the hypervisor message IPC. * * This involves a hypercall. @@ -68,7 +105,7 @@ int hv_post_message(union hv_connection_id connection_id, */ put_cpu_ptr(hv_cpu); - return status & 0xFFFF; + return hv_result(status); } int hv_synic_alloc(void) @@ -162,34 +199,48 @@ void hv_synic_enable_regs(unsigned int cpu) union hv_synic_scontrol sctrl; /* Setup the Synic's message page */ - hv_get_simp(simp.as_uint64); + simp.as_uint64 = hv_get_register(HV_REGISTER_SIMP); simp.simp_enabled = 1; simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page) >> HV_HYP_PAGE_SHIFT; - hv_set_simp(simp.as_uint64); + hv_set_register(HV_REGISTER_SIMP, simp.as_uint64); /* Setup the Synic's event page */ - hv_get_siefp(siefp.as_uint64); + siefp.as_uint64 = hv_get_register(HV_REGISTER_SIEFP); siefp.siefp_enabled = 1; siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page) >> HV_HYP_PAGE_SHIFT; - hv_set_siefp(siefp.as_uint64); + hv_set_register(HV_REGISTER_SIEFP, siefp.as_uint64); /* Setup the shared SINT. */ - hv_get_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64); + if (vmbus_irq != -1) + enable_percpu_irq(vmbus_irq, 0); + shared_sint.as_uint64 = hv_get_register(HV_REGISTER_SINT0 + + VMBUS_MESSAGE_SINT); - shared_sint.vector = hv_get_vector(); + shared_sint.vector = vmbus_interrupt; shared_sint.masked = false; - shared_sint.auto_eoi = hv_recommend_using_aeoi(); - hv_set_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64); + + /* + * On architectures where Hyper-V doesn't support AEOI (e.g., ARM64), + * it doesn't provide a recommendation flag and AEOI must be disabled. + */ +#ifdef HV_DEPRECATING_AEOI_RECOMMENDED + shared_sint.auto_eoi = + !(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED); +#else + shared_sint.auto_eoi = 0; +#endif + hv_set_register(HV_REGISTER_SINT0 + VMBUS_MESSAGE_SINT, + shared_sint.as_uint64); /* Enable the global synic bit */ - hv_get_synic_state(sctrl.as_uint64); + sctrl.as_uint64 = hv_get_register(HV_REGISTER_SCONTROL); sctrl.enable = 1; - hv_set_synic_state(sctrl.as_uint64); + hv_set_register(HV_REGISTER_SCONTROL, sctrl.as_uint64); } int hv_synic_init(unsigned int cpu) @@ -211,30 +262,71 @@ void hv_synic_disable_regs(unsigned int cpu) union hv_synic_siefp siefp; union hv_synic_scontrol sctrl; - hv_get_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64); + shared_sint.as_uint64 = hv_get_register(HV_REGISTER_SINT0 + + VMBUS_MESSAGE_SINT); shared_sint.masked = 1; /* Need to correctly cleanup in the case of SMP!!! */ /* Disable the interrupt */ - hv_set_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64); + hv_set_register(HV_REGISTER_SINT0 + VMBUS_MESSAGE_SINT, + shared_sint.as_uint64); - hv_get_simp(simp.as_uint64); + simp.as_uint64 = hv_get_register(HV_REGISTER_SIMP); simp.simp_enabled = 0; simp.base_simp_gpa = 0; - hv_set_simp(simp.as_uint64); + hv_set_register(HV_REGISTER_SIMP, simp.as_uint64); - hv_get_siefp(siefp.as_uint64); + siefp.as_uint64 = hv_get_register(HV_REGISTER_SIEFP); siefp.siefp_enabled = 0; siefp.base_siefp_gpa = 0; - hv_set_siefp(siefp.as_uint64); + hv_set_register(HV_REGISTER_SIEFP, siefp.as_uint64); /* Disable the global synic bit */ - hv_get_synic_state(sctrl.as_uint64); + sctrl.as_uint64 = hv_get_register(HV_REGISTER_SCONTROL); sctrl.enable = 0; - hv_set_synic_state(sctrl.as_uint64); + hv_set_register(HV_REGISTER_SCONTROL, sctrl.as_uint64); + + if (vmbus_irq != -1) + disable_percpu_irq(vmbus_irq); +} + +#define HV_MAX_TRIES 3 +/* + * Scan the event flags page of 'this' CPU looking for any bit that is set. If we find one + * bit set, then wait for a few milliseconds. Repeat these steps for a maximum of 3 times. + * Return 'true', if there is still any set bit after this operation; 'false', otherwise. + * + * If a bit is set, that means there is a pending channel interrupt. The expectation is + * that the normal interrupt handling mechanism will find and process the channel interrupt + * "very soon", and in the process clear the bit. + */ +static bool hv_synic_event_pending(void) +{ + struct hv_per_cpu_context *hv_cpu = this_cpu_ptr(hv_context.cpu_context); + union hv_synic_event_flags *event = + (union hv_synic_event_flags *)hv_cpu->synic_event_page + VMBUS_MESSAGE_SINT; + unsigned long *recv_int_page = event->flags; /* assumes VMBus version >= VERSION_WIN8 */ + bool pending; + u32 relid; + int tries = 0; + +retry: + pending = false; + for_each_set_bit(relid, recv_int_page, HV_EVENT_FLAGS_COUNT) { + /* Special case - VMBus channel protocol messages */ + if (relid == 0) + continue; + pending = true; + break; + } + if (pending && tries++ < HV_MAX_TRIES) { + usleep_range(10000, 20000); + goto retry; + } + return pending; } int hv_synic_cleanup(unsigned int cpu) @@ -242,6 +334,9 @@ int hv_synic_cleanup(unsigned int cpu) struct vmbus_channel *channel, *sc; bool channel_found = false; + if (vmbus_connection.conn_state != CONNECTED) + goto always_cleanup; + /* * Hyper-V does not provide a way to change the connect CPU once * it is set; we must prevent the connect CPU from going offline @@ -249,8 +344,7 @@ int hv_synic_cleanup(unsigned int cpu) * path where the vmbus is already disconnected, the CPU must be * allowed to shut down. */ - if (cpu == VMBUS_CONNECT_CPU && - vmbus_connection.conn_state == CONNECTED) + if (cpu == VMBUS_CONNECT_CPU) return -EBUSY; /* @@ -277,9 +371,21 @@ int hv_synic_cleanup(unsigned int cpu) } mutex_unlock(&vmbus_connection.channel_mutex); - if (channel_found && vmbus_connection.conn_state == CONNECTED) + if (channel_found) + return -EBUSY; + + /* + * channel_found == false means that any channels that were previously + * assigned to the CPU have been reassigned elsewhere with a call of + * vmbus_send_modifychannel(). Scan the event flags page looking for + * bits that are set and waiting with a timeout for vmbus_chan_sched() + * to process such bits. If bits are still set after this operation + * and VMBus is connected, fail the CPU offlining operation. + */ + if (vmbus_proto_version >= VERSION_WIN10_V4_1 && hv_synic_event_pending()) return -EBUSY; +always_cleanup: hv_stimer_legacy_cleanup(cpu); hv_synic_disable_regs(cpu); diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 2f776d78e3c1..58af84e30144 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -21,6 +21,7 @@ #include <linux/memory.h> #include <linux/notifier.h> #include <linux/percpu_counter.h> +#include <linux/page_reporting.h> #include <linux/hyperv.h> #include <asm/hyperv-tlfs.h> @@ -563,6 +564,8 @@ struct hv_dynmem_device { * The negotiated version agreed by host. */ __u32 version; + + struct page_reporting_dev_info pr_dev_info; }; static struct hv_dynmem_device dm_device; @@ -1568,6 +1571,89 @@ static void balloon_onchannelcallback(void *context) } +/* Hyper-V only supports reporting 2MB pages or higher */ +#define HV_MIN_PAGE_REPORTING_ORDER 9 +#define HV_MIN_PAGE_REPORTING_LEN (HV_HYP_PAGE_SIZE << HV_MIN_PAGE_REPORTING_ORDER) +static int hv_free_page_report(struct page_reporting_dev_info *pr_dev_info, + struct scatterlist *sgl, unsigned int nents) +{ + unsigned long flags; + struct hv_memory_hint *hint; + int i; + u64 status; + struct scatterlist *sg; + + WARN_ON_ONCE(nents > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES); + WARN_ON_ONCE(sgl->length < HV_MIN_PAGE_REPORTING_LEN); + local_irq_save(flags); + hint = *(struct hv_memory_hint **)this_cpu_ptr(hyperv_pcpu_input_arg); + if (!hint) { + local_irq_restore(flags); + return -ENOSPC; + } + + hint->type = HV_EXT_MEMORY_HEAT_HINT_TYPE_COLD_DISCARD; + hint->reserved = 0; + for_each_sg(sgl, sg, nents, i) { + union hv_gpa_page_range *range; + + range = &hint->ranges[i]; + range->address_space = 0; + /* page reporting only reports 2MB pages or higher */ + range->page.largepage = 1; + range->page.additional_pages = + (sg->length / HV_MIN_PAGE_REPORTING_LEN) - 1; + range->page_size = HV_GPA_PAGE_RANGE_PAGE_SIZE_2MB; + range->base_large_pfn = + page_to_hvpfn(sg_page(sg)) >> HV_MIN_PAGE_REPORTING_ORDER; + } + + status = hv_do_rep_hypercall(HV_EXT_CALL_MEMORY_HEAT_HINT, nents, 0, + hint, NULL); + local_irq_restore(flags); + if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) { + pr_err("Cold memory discard hypercall failed with status %llx\n", + status); + return -EINVAL; + } + + return 0; +} + +static void enable_page_reporting(void) +{ + int ret; + + /* Essentially, validating 'PAGE_REPORTING_MIN_ORDER' is big enough. */ + if (pageblock_order < HV_MIN_PAGE_REPORTING_ORDER) { + pr_debug("Cold memory discard is only supported on 2MB pages and above\n"); + return; + } + + if (!hv_query_ext_cap(HV_EXT_CAPABILITY_MEMORY_COLD_DISCARD_HINT)) { + pr_debug("Cold memory discard hint not supported by Hyper-V\n"); + return; + } + + BUILD_BUG_ON(PAGE_REPORTING_CAPACITY > HV_MEMORY_HINT_MAX_GPA_PAGE_RANGES); + dm_device.pr_dev_info.report = hv_free_page_report; + ret = page_reporting_register(&dm_device.pr_dev_info); + if (ret < 0) { + dm_device.pr_dev_info.report = NULL; + pr_err("Failed to enable cold memory discard: %d\n", ret); + } else { + pr_info("Cold memory discard hint enabled\n"); + } +} + +static void disable_page_reporting(void) +{ + if (dm_device.pr_dev_info.report) { + page_reporting_unregister(&dm_device.pr_dev_info); + dm_device.pr_dev_info.report = NULL; + } +} + static int balloon_connect_vsp(struct hv_device *dev) { struct dm_version_request version_req; @@ -1713,6 +1799,7 @@ static int balloon_probe(struct hv_device *dev, if (ret != 0) return ret; + enable_page_reporting(); dm_device.state = DM_INITIALIZED; dm_device.thread = @@ -1727,6 +1814,7 @@ static int balloon_probe(struct hv_device *dev, probe_error: dm_device.state = DM_INIT_ERROR; dm_device.thread = NULL; + disable_page_reporting(); vmbus_close(dev->channel); #ifdef CONFIG_MEMORY_HOTPLUG unregister_memory_notifier(&hv_memory_nb); @@ -1749,6 +1837,7 @@ static int balloon_remove(struct hv_device *dev) cancel_work_sync(&dm->ha_wrk.wrk); kthread_stop(dm->thread); + disable_page_reporting(); vmbus_close(dev->channel); #ifdef CONFIG_MEMORY_HOTPLUG unregister_memory_notifier(&hv_memory_nb); diff --git a/drivers/hv/hv_trace.h b/drivers/hv/hv_trace.h index 6063bb21bb13..c02a1719e92f 100644 --- a/drivers/hv/hv_trace.h +++ b/drivers/hv/hv_trace.h @@ -103,6 +103,21 @@ TRACE_EVENT(vmbus_ongpadl_created, ) ); +TRACE_EVENT(vmbus_onmodifychannel_response, + TP_PROTO(const struct vmbus_channel_modifychannel_response *response), + TP_ARGS(response), + TP_STRUCT__entry( + __field(u32, child_relid) + __field(u32, status) + ), + TP_fast_assign(__entry->child_relid = response->child_relid; + __entry->status = response->status; + ), + TP_printk("child_relid 0x%x, status %d", + __entry->child_relid, __entry->status + ) + ); + TRACE_EVENT(vmbus_ongpadl_torndown, TP_PROTO(const struct vmbus_channel_gpadl_torndown *gpadltorndown), TP_ARGS(gpadltorndown), diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index 35833d4d1a1d..374f8afbf8a5 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -84,15 +84,6 @@ hv_set_next_write_location(struct hv_ring_buffer_info *ring_info, ring_info->ring_buffer->write_index = next_write_location; } -/* Set the next read location for the specified ring buffer. */ -static inline void -hv_set_next_read_location(struct hv_ring_buffer_info *ring_info, - u32 next_read_location) -{ - ring_info->ring_buffer->read_index = next_read_location; - ring_info->priv_read_index = next_read_location; -} - /* Get the size of the ring buffer. */ static inline u32 hv_get_ring_buffersize(const struct hv_ring_buffer_info *ring_info) @@ -313,7 +304,6 @@ int hv_ringbuffer_write(struct vmbus_channel *channel, rqst_id = vmbus_next_request_id(&channel->requestor, requestid); if (rqst_id == VMBUS_RQST_ERROR) { spin_unlock_irqrestore(&outring_info->ring_lock, flags); - pr_err("No request id available\n"); return -EAGAIN; } } diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 10dce9f91216..b12d6827b222 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -48,8 +48,10 @@ static int hyperv_cpuhp_online; static void *hv_panic_page; +static long __percpu *vmbus_evt; + /* Values parsed from ACPI DSDT */ -static int vmbus_irq; +int vmbus_irq; int vmbus_interrupt; /* @@ -1381,7 +1383,13 @@ static void vmbus_isr(void) tasklet_schedule(&hv_cpu->msg_dpc); } - add_interrupt_randomness(hv_get_vector(), 0); + add_interrupt_randomness(vmbus_interrupt, 0); +} + +static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id) +{ + vmbus_isr(); + return IRQ_HANDLED; } /* @@ -1392,22 +1400,36 @@ static void hv_kmsg_dump(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason) { size_t bytes_written; - phys_addr_t panic_pa; /* We are only interested in panics. */ if ((reason != KMSG_DUMP_PANIC) || (!sysctl_record_panic_msg)) return; - panic_pa = virt_to_phys(hv_panic_page); - /* * Write dump contents to the page. No need to synchronize; panic should * be single-threaded. */ kmsg_dump_get_buffer(dumper, false, hv_panic_page, HV_HYP_PAGE_SIZE, &bytes_written); - if (bytes_written) - hyperv_report_panic_msg(panic_pa, bytes_written); + if (!bytes_written) + return; + /* + * P3 to contain the physical address of the panic page & P4 to + * contain the size of the panic data in that page. Rest of the + * registers are no-op when the NOTIFY_MSG flag is set. + */ + hv_set_register(HV_REGISTER_CRASH_P0, 0); + hv_set_register(HV_REGISTER_CRASH_P1, 0); + hv_set_register(HV_REGISTER_CRASH_P2, 0); + hv_set_register(HV_REGISTER_CRASH_P3, virt_to_phys(hv_panic_page)); + hv_set_register(HV_REGISTER_CRASH_P4, bytes_written); + + /* + * Let Hyper-V know there is crash data available along with + * the panic message. + */ + hv_set_register(HV_REGISTER_CRASH_CTL, + (HV_CRASH_CTL_CRASH_NOTIFY | HV_CRASH_CTL_CRASH_NOTIFY_MSG)); } static struct kmsg_dumper hv_kmsg_dumper = { @@ -1482,9 +1504,28 @@ static int vmbus_bus_init(void) if (ret) return ret; - ret = hv_setup_vmbus_irq(vmbus_irq, vmbus_isr); - if (ret) - goto err_setup; + /* + * VMbus interrupts are best modeled as per-cpu interrupts. If + * on an architecture with support for per-cpu IRQs (e.g. ARM64), + * allocate a per-cpu IRQ using standard Linux kernel functionality. + * If not on such an architecture (e.g., x86/x64), then rely on + * code in the arch-specific portion of the code tree to connect + * the VMbus interrupt handler. + */ + + if (vmbus_irq == -1) { + hv_setup_vmbus_handler(vmbus_isr); + } else { + vmbus_evt = alloc_percpu(long); + ret = request_percpu_irq(vmbus_irq, vmbus_percpu_isr, + "Hyper-V VMbus", vmbus_evt); + if (ret) { + pr_err("Can't request Hyper-V VMbus IRQ %d, Err %d", + vmbus_irq, ret); + free_percpu(vmbus_evt); + goto err_setup; + } + } ret = hv_synic_alloc(); if (ret) @@ -1521,7 +1562,7 @@ static int vmbus_bus_init(void) * Register for panic kmsg callback only if the right * capability is supported by the hypervisor. */ - hv_get_crash_ctl(hyperv_crash_ctl); + hyperv_crash_ctl = hv_get_register(HV_REGISTER_CRASH_CTL); if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG) hv_kmsg_dump_register(); @@ -1545,7 +1586,12 @@ err_connect: err_cpuhp: hv_synic_free(); err_alloc: - hv_remove_vmbus_irq(); + if (vmbus_irq == -1) { + hv_remove_vmbus_handler(); + } else { + free_percpu_irq(vmbus_irq, vmbus_evt); + free_percpu(vmbus_evt); + } err_setup: bus_unregister(&hv_bus); unregister_sysctl_table(hv_ctl_table_hdr); @@ -1802,13 +1848,15 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel, if (target_cpu == origin_cpu) goto cpu_store_unlock; - if (vmbus_send_modifychannel(channel->offermsg.child_relid, + if (vmbus_send_modifychannel(channel, hv_cpu_number_to_vp_number(target_cpu))) { ret = -EIO; goto cpu_store_unlock; } /* + * For version before VERSION_WIN10_V5_3, the following warning holds: + * * Warning. At this point, there is *no* guarantee that the host will * have successfully processed the vmbus_send_modifychannel() request. * See the header comment of vmbus_send_modifychannel() for more info. @@ -2663,6 +2711,18 @@ static int __init hv_acpi_init(void) ret = -ETIMEDOUT; goto cleanup; } + + /* + * If we're on an architecture with a hardcoded hypervisor + * vector (i.e. x86/x64), override the VMbus interrupt found + * in the ACPI tables. Ensure vmbus_irq is not set since the + * normal Linux IRQ mechanism is not used in this case. + */ +#ifdef HYPERVISOR_CALLBACK_VECTOR + vmbus_interrupt = HYPERVISOR_CALLBACK_VECTOR; + vmbus_irq = -1; +#endif + hv_debug_init(); ret = vmbus_bus_init(); @@ -2693,7 +2753,12 @@ static void __exit vmbus_exit(void) vmbus_connection.conn_state = DISCONNECTED; hv_stimer_global_cleanup(); vmbus_disconnect(); - hv_remove_vmbus_irq(); + if (vmbus_irq == -1) { + hv_remove_vmbus_handler(); + } else { + free_percpu_irq(vmbus_irq, vmbus_evt); + free_percpu(vmbus_evt); + } for_each_online_cpu(cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index a313708bcf75..1ff4ce24f4b3 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -1292,7 +1292,7 @@ exit_unlock: * resumes, hv_pci_restore_msi_state() is able to correctly restore * the interrupt with the correct affinity. */ - if (res && hbus->state != hv_pcibus_removing) + if (!hv_result_success(res) && hbus->state != hv_pcibus_removing) dev_err(&hbus->hdev->device, "%s() failed: %#llx", __func__, res); diff --git a/drivers/video/fbdev/hyperv_fb.c b/drivers/video/fbdev/hyperv_fb.c index 4dc9077dd2ac..a7e6eea2c4a1 100644 --- a/drivers/video/fbdev/hyperv_fb.c +++ b/drivers/video/fbdev/hyperv_fb.c @@ -308,7 +308,7 @@ static inline int synthvid_send(struct hv_device *hdev, VM_PKT_DATA_INBAND, 0); if (ret) - pr_err("Unable to send packet via vmbus\n"); + pr_err_ratelimited("Unable to send packet via vmbus; error %d\n", ret); return ret; } |